diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index ed9503bad12..9456cb642ec 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3458,6 +3458,108 @@ let TargetPrefix = "x86" in { llvm_i8_ty], [IntrNoMem]>; } +// Compress, Expand +let TargetPrefix = "x86" in { + def int_x86_avx512_mask_compress_ps_512 : + GCCBuiltin<"__builtin_ia32_compresssf512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, + llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_pd_512 : + GCCBuiltin<"__builtin_ia32_compressdf512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_ps_256 : + GCCBuiltin<"__builtin_ia32_compresssf256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_pd_256 : + GCCBuiltin<"__builtin_ia32_compressdf256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_ps_128 : + GCCBuiltin<"__builtin_ia32_compresssf128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_pd_128 : + GCCBuiltin<"__builtin_ia32_compressdf128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_compress_store_ps_512 : + GCCBuiltin<"__builtin_ia32_compressstoresf512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, + llvm_i16_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_pd_512 : + GCCBuiltin<"__builtin_ia32_compressstoredf512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_ps_256 : + GCCBuiltin<"__builtin_ia32_compressstoresf256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, + llvm_i8_ty], 
[IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_pd_256 : + GCCBuiltin<"__builtin_ia32_compressstoredf256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_ps_128 : + GCCBuiltin<"__builtin_ia32_compressstoresf128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_pd_128 : + GCCBuiltin<"__builtin_ia32_compressstoredf128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_compress_d_512 : + GCCBuiltin<"__builtin_ia32_compresssi512_mask">, + Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, + llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_q_512 : + GCCBuiltin<"__builtin_ia32_compressdi512_mask">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_d_256 : + GCCBuiltin<"__builtin_ia32_compresssi256_mask">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_q_256 : + GCCBuiltin<"__builtin_ia32_compressdi256_mask">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_d_128 : + GCCBuiltin<"__builtin_ia32_compresssi128_mask">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_compress_q_128 : + GCCBuiltin<"__builtin_ia32_compressdi128_mask">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, + llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_compress_store_d_512 : + GCCBuiltin<"__builtin_ia32_compressstoresi512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, + llvm_i16_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_q_512 : + GCCBuiltin<"__builtin_ia32_compressstoredi512_mask">, + 
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_d_256 : + GCCBuiltin<"__builtin_ia32_compressstoresi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_q_256 : + GCCBuiltin<"__builtin_ia32_compressstoredi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_d_128 : + GCCBuiltin<"__builtin_ia32_compressstoresi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_compress_store_q_128 : + GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, + llvm_i8_ty], [IntrReadWriteArgMem]>; +} // Misc. let TargetPrefix = "x86" in { def int_x86_avx512_mask_cmp_ps_512 : GCCBuiltin<"__builtin_ia32_cmpps512_mask">, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1c629ed3164..4a086127786 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16957,9 +16957,31 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); case VSHIFT_MASK: - return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG), - Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_TO_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnes(Mask)) // return data as is + return 
Op.getOperand(1); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, + PassThru); + } default: break; } @@ -17477,6 +17499,39 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Results.push_back(Store); return DAG.getMergeValues(Results, dl); } + case COMPRESS_TO_MEM: { + // Operand 0 is the chain, 2 the address, 3 the data and 4 the mask. + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + // FIXME(review): presumably the trailing 0 is the alignment argument; test + // compr7 expects the aligned vmovapd here although Addr is an arbitrary + // i8* - confirm that an aligned store is really intended. + if (isAllOnes(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, 0); + + EVT VT = DataToCompress.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // Bitcast the integer mask to a vector of i1 and keep the first + // VT.getVectorNumElements() elements of it. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + // Build the IntrData->Opc0 node with an undef third operand, then store + // its result to Addr. + SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, + DataToCompress, DAG.getUNDEF(VT)); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, 0); + } } } @@ -19662,6 +19717,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 
7c6ffa2afa2..b793171e2c5 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -379,6 +379,10 @@ namespace llvm { FMADDSUB, FMSUBADD, + // Compress and expand + COMPRESS, + EXPAND, + // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 48287f40f83..b512305d5dd 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5281,3 +5281,51 @@ multiclass avx512_convert_mask_to_vector { } defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// +multiclass compress_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayStore = 1 in { + def mrk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width, EVEX_V256; + defm Z128 : compress_by_vec_width, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 5448b9f9131..04de47be08a 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -283,6 +283,10 
@@ def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bcd55c7df4a..e7ad044d1e5 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -21,7 +21,8 @@ enum IntrinsicType { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM, + COMPRESS_TO_REG, COMPRESS_TO_MEM }; struct IntrinsicData { @@ -70,6 +71,31 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128, + COMPRESS_TO_MEM, 
X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -157,8 +183,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), - X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), - X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -207,6 +233,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_TO_REG, + X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index fa19084eb68..8f3b32a93c0 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -67,7 +67,7 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8) define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK_LABEL: test_cmp_d_256 +; CHECK-LABEL: test_cmp_d_256 ; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -96,7 +96,7 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { -; 
CHECK_LABEL: test_mask_cmp_d_256 +; CHECK-LABEL: test_mask_cmp_d_256 ; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -127,7 +127,7 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK_LABEL: test_ucmp_d_256 +; CHECK-LABEL: test_ucmp_d_256 ; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -156,7 +156,7 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_ucmp_d_256 +; CHECK-LABEL: test_mask_ucmp_d_256 ; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -187,7 +187,7 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK_LABEL: test_cmp_q_256 +; CHECK-LABEL: test_cmp_q_256 ; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -216,7 +216,7 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { } define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_cmp_q_256 +; CHECK-LABEL: test_mask_cmp_q_256 ; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## %res0 
= call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -247,7 +247,7 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK_LABEL: test_ucmp_q_256 +; CHECK-LABEL: test_ucmp_q_256 ; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -276,7 +276,7 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { } define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_ucmp_q_256 +; CHECK-LABEL: test_mask_ucmp_q_256 ; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -373,7 +373,7 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8) define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK_LABEL: test_cmp_d_128 +; CHECK-LABEL: test_cmp_d_128 ; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -402,7 +402,7 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { } define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_cmp_d_128 +; CHECK-LABEL: test_mask_cmp_d_128 ; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ 
-433,7 +433,7 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK_LABEL: test_ucmp_d_128 +; CHECK-LABEL: test_ucmp_d_128 ; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -462,7 +462,7 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { } define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_ucmp_d_128 +; CHECK-LABEL: test_mask_ucmp_d_128 ; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -493,7 +493,7 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK_LABEL: test_cmp_q_128 +; CHECK-LABEL: test_cmp_q_128 ; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -522,7 +522,7 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { } define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_cmp_q_128 +; CHECK-LABEL: test_mask_cmp_q_128 ; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -553,7 +553,7 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { declare i8 
@llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK_LABEL: test_ucmp_q_128 +; CHECK-LABEL: test_ucmp_q_128 ; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -582,7 +582,7 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { } define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { -; CHECK_LABEL: test_mask_ucmp_q_128 +; CHECK-LABEL: test_mask_ucmp_q_128 ; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -611,3 +611,90 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { } declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone + +; CHECK-LABEL: compr1 +; CHECK: vcompresspd %zmm0 +define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + +; CHECK-LABEL: compr2 +; CHECK: vcompresspd %ymm0 +define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + +; CHECK-LABEL: compr3 +; CHECK: vcompressps %xmm0 +define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> 
%data, i8 %mask) + +; CHECK-LABEL: compr4 +; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0] +define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +; CHECK-LABEL: compr5 +; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1] +define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask) + +; CHECK-LABEL: compr6 +; CHECK: vcompressps %xmm0 +define <4 x float> @compr6(<4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask) + +; CHECK-LABEL: compr7 +; CHECK-NOT: vcompress +; CHECK: vmovapd +define void @compr7(i8* %addr, <8 x double> %data) { + call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1) + ret void +} + +; CHECK-LABEL: compr8 +; CHECK-NOT: vcompressps %xmm0 +define <4 x float> @compr8(<4 x float> %data) { + %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} + +; CHECK-LABEL: compr9 +; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07] +define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.q.512(i8* 
%addr, <8 x i64> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + +; CHECK-LABEL: compr10 +; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0] +define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) { + %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)