AVX-512: Added all forms of COMPRESS instruction

+ intrinsics + tests


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224019 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2014-12-11 15:02:24 +00:00
parent c3692e5c67
commit 11fb1d0eb5
7 changed files with 365 additions and 22 deletions

View File

@ -3458,6 +3458,108 @@ let TargetPrefix = "x86" in {
llvm_i8_ty], [IntrNoMem]>;
}
// Compress, Expand
let TargetPrefix = "x86" in {
def int_x86_avx512_mask_compress_ps_512 :
GCCBuiltin<"__builtin_ia32_compresssf512_mask">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_pd_512 :
GCCBuiltin<"__builtin_ia32_compressdf512_mask">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_ps_256 :
GCCBuiltin<"__builtin_ia32_compresssf256_mask">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_pd_256 :
GCCBuiltin<"__builtin_ia32_compressdf256_mask">,
Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_ps_128 :
GCCBuiltin<"__builtin_ia32_compresssf128_mask">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_pd_128 :
GCCBuiltin<"__builtin_ia32_compressdf128_mask">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_store_ps_512 :
GCCBuiltin<"__builtin_ia32_compressstoresf512_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty,
llvm_i16_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_pd_512 :
GCCBuiltin<"__builtin_ia32_compressstoredf512_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_ps_256 :
GCCBuiltin<"__builtin_ia32_compressstoresf256_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_pd_256 :
GCCBuiltin<"__builtin_ia32_compressstoredf256_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_ps_128 :
GCCBuiltin<"__builtin_ia32_compressstoresf128_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_pd_128 :
GCCBuiltin<"__builtin_ia32_compressstoredf128_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_d_512 :
GCCBuiltin<"__builtin_ia32_compresssi512_mask">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_q_512 :
GCCBuiltin<"__builtin_ia32_compressdi512_mask">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_d_256 :
GCCBuiltin<"__builtin_ia32_compresssi256_mask">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_q_256 :
GCCBuiltin<"__builtin_ia32_compressdi256_mask">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_d_128 :
GCCBuiltin<"__builtin_ia32_compresssi128_mask">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_q_128 :
GCCBuiltin<"__builtin_ia32_compressdi128_mask">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_compress_store_d_512 :
GCCBuiltin<"__builtin_ia32_compressstoresi512_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty,
llvm_i16_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_q_512 :
GCCBuiltin<"__builtin_ia32_compressstoredi512_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_d_256 :
GCCBuiltin<"__builtin_ia32_compressstoresi256_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_q_256 :
GCCBuiltin<"__builtin_ia32_compressstoredi256_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_d_128 :
GCCBuiltin<"__builtin_ia32_compressstoresi128_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_mask_compress_store_q_128 :
GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">,
Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty,
llvm_i8_ty], [IntrReadWriteArgMem]>;
}
// Misc.
let TargetPrefix = "x86" in {
def int_x86_avx512_mask_cmp_ps_512 : GCCBuiltin<"__builtin_ia32_cmpps512_mask">,

View File

@ -16957,9 +16957,31 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
case VSHIFT_MASK:
return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG),
Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
Op.getSimpleValueType(),
Op.getOperand(1),
Op.getOperand(2), DAG),
Op.getOperand(4), Op.getOperand(3), Subtarget,
DAG);
case COMPRESS_TO_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (isAllOnes(Mask)) // return data as is
return Op.getOperand(1);
EVT VT = Op.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDLoc dl(Op);
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
DAG.getIntPtrConstant(0));
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
PassThru);
}
default:
break;
}
@ -17477,6 +17499,31 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
Results.push_back(Store);
return DAG.getMergeValues(Results, dl);
}
case COMPRESS_TO_MEM: {
SDLoc dl(Op);
SDValue Mask = Op.getOperand(4);
SDValue DataToCompress = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
if (isAllOnes(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
MachinePointerInfo(), false, false, 0);
EVT VT = DataToCompress.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
DAG.getIntPtrConstant(0));
SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
DataToCompress, DAG.getUNDEF(VT));
return DAG.getStore(Chain, dl, Compressed, Addr,
MachinePointerInfo(), false, false, 0);
}
}
}
@ -19662,6 +19709,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
}
}

View File

@ -379,6 +379,10 @@ namespace llvm {
FMADDSUB,
FMSUBADD,
// Compress and expand
COMPRESS,
EXPAND,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,

View File

@ -5281,3 +5281,51 @@ multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
}
defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
//===----------------------------------------------------------------------===//
// AVX-512 - COMPRESS and EXPAND
//
multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
def rrkz : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
[(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src,
_.ImmAllZerosV)))]>, EVEX_KZ;
let Constraints = "$src0 = $dst" in
def rrk : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
[(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src,
_.RC:$src0)))]>, EVEX_K;
let mayStore = 1 in {
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
[(store (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, undef)),
addr:$dst)]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
}
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
}
}
defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
EVEX;
defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
EVEX, VEX_W;
defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
EVEX;
defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
EVEX, VEX_W;

View File

@ -283,6 +283,10 @@ def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//

View File

@ -21,7 +21,8 @@ enum IntrinsicType {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM
INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM,
COMPRESS_TO_REG, COMPRESS_TO_MEM
};
struct IntrinsicData {
@ -70,6 +71,31 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
@ -157,8 +183,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
@ -207,6 +233,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_TO_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),

View File

@ -67,7 +67,7 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK_LABEL: test_cmp_d_256
; CHECK-LABEL: test_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -96,7 +96,7 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
}
define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_cmp_d_256
; CHECK-LABEL: test_mask_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -127,7 +127,7 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK_LABEL: test_ucmp_d_256
; CHECK-LABEL: test_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -156,7 +156,7 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
}
define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_ucmp_d_256
; CHECK-LABEL: test_mask_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -187,7 +187,7 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK_LABEL: test_cmp_q_256
; CHECK-LABEL: test_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -216,7 +216,7 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
}
define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_cmp_q_256
; CHECK-LABEL: test_mask_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -247,7 +247,7 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK_LABEL: test_ucmp_q_256
; CHECK-LABEL: test_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -276,7 +276,7 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
}
define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_ucmp_q_256
; CHECK-LABEL: test_mask_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -373,7 +373,7 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK_LABEL: test_cmp_d_128
; CHECK-LABEL: test_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -402,7 +402,7 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
}
define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_cmp_d_128
; CHECK-LABEL: test_mask_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -433,7 +433,7 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK_LABEL: test_ucmp_d_128
; CHECK-LABEL: test_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -462,7 +462,7 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
}
define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_ucmp_d_128
; CHECK-LABEL: test_mask_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -493,7 +493,7 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK_LABEL: test_cmp_q_128
; CHECK-LABEL: test_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -522,7 +522,7 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
}
define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_cmp_q_128
; CHECK-LABEL: test_mask_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -553,7 +553,7 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK_LABEL: test_ucmp_q_128
; CHECK-LABEL: test_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -582,7 +582,7 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
}
define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK_LABEL: test_mask_ucmp_q_128
; CHECK-LABEL: test_mask_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@ -611,3 +611,90 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
}
declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
; CHECK-LABEL: compr1
; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
; CHECK-LABEL: compr2
; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
; CHECK-LABEL: compr3
; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
; CHECK-LABEL: compr4
; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
%res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
; CHECK-LABEL: compr5
; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
%res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
; CHECK-LABEL: compr6
; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
; CHECK-LABEL: compr7
; CHECK-NOT: vcompress
; CHECK: vmovapd
define void @compr7(i8* %addr, <8 x double> %data) {
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret void
}
; CHECK-LABEL: compr8
; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
; CHECK-LABEL: compr9
; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
; CHECK-LABEL: compr10
; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
%res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)