diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 8caedfaf048..102a3f4f589 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -4264,6 +4264,102 @@ let TargetPrefix = "x86" in { llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; + def int_x86_avx512_gather3div2_df : + GCCBuiltin<"__builtin_ia32_gather3div2df">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div2_di : + GCCBuiltin<"__builtin_ia32_gather3div2di">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div4_df : + GCCBuiltin<"__builtin_ia32_gather3div4df">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div4_di : + GCCBuiltin<"__builtin_ia32_gather3div4di">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div4_sf : + GCCBuiltin<"__builtin_ia32_gather3div4sf">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div4_si : + GCCBuiltin<"__builtin_ia32_gather3div4si">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div8_sf : + GCCBuiltin<"__builtin_ia32_gather3div8sf">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3div8_si : + GCCBuiltin<"__builtin_ia32_gather3div8si">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv2_df : + GCCBuiltin<"__builtin_ia32_gather3siv2df">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv2_di : + GCCBuiltin<"__builtin_ia32_gather3siv2di">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv4_df : + GCCBuiltin<"__builtin_ia32_gather3siv4df">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv4_di : + GCCBuiltin<"__builtin_ia32_gather3siv4di">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv4_sf : + GCCBuiltin<"__builtin_ia32_gather3siv4sf">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv4_si : + GCCBuiltin<"__builtin_ia32_gather3siv4si">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv8_sf : + GCCBuiltin<"__builtin_ia32_gather3siv8sf">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], + [IntrReadArgMem]>; + + def int_x86_avx512_gather3siv8_si : + GCCBuiltin<"__builtin_ia32_gather3siv8si">, + Intrinsic<[llvm_v8i32_ty], + 
                 [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
+                [IntrReadArgMem]>;
+
   // scatter
   def int_x86_avx512_scatter_dpd_512  : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index b3066efbab2..7ec02408ffa 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -238,18 +238,34 @@ struct X86Operand : public MCParsedAsmOperand {
     return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
       getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
   }
+  bool isMemVX32X() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+  }
   bool isMemVY32() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
       getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
   }
+  bool isMemVY32X() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+  }
   bool isMemVX64() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
       getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
   }
+  bool isMemVX64X() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+  }
   bool isMemVY64() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
       getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
   }
+  bool isMemVY64X() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+  }
   bool isMemVZ32() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
       getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 47d107607e6..b2e08c28f48 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15424,7 +15424,12 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget * Subtarget) {
   SDLoc dl(Op);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
+  if (!C)
+    llvm_unreachable("Invalid scale type");
+  unsigned ScaleVal = C->getZExtValue();
+  if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
+    llvm_unreachable("Valid scale values are 1, 2, 4, 8");
+
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   EVT MaskVT = MVT::getVectorVT(MVT::i1,
                                 Index.getSimpleValueType().getVectorNumElements());
@@ -15432,8 +15437,16 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   if (MaskC)
     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else
-    MaskInReg = DAG.getBitcast(MaskVT, Mask);
+  else {
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+
+    // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements of the
+    // wider mask are extracted with EXTRACT_SUBVECTOR.
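+    // (Illustration: the i8 mask operand of the new 128/256-bit intrinsics
+    // is first bitcast to v8i1 here; the subvector extract below then keeps
+    // only its low 2 or 4 bits as the v2i1/v4i1 mask the gather node uses.)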
+    MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                            DAG.getBitcast(BitcastVT, Mask),
+                            DAG.getIntPtrConstant(0, dl));
+  }
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b4ccbfb0248..d2197a20b12 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5440,10 +5440,11 @@ defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
 
 multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          X86MemOperand memop, PatFrag GatherNode> {
-  let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+  let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
+      ExeDomain = _.ExeDomain in
   def rm  : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
@@ ... @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
             EVEX_CD8<_.EltSize, CD8VT1>;
 }
 
-let ExeDomain = SSEPackedDouble in {
-defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem,
-                                 mgatherv8i32>, EVEX_V512, VEX_W;
-defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512, VEX_W;
+multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
+                        AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
+                                       vy64xmem, mgatherv8i32>, EVEX_V512, VEX_W;
+  defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
+                                       vz64mem, mgatherv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+                                          vx64xmem, mgatherv4i32>, EVEX_V256, VEX_W;
+  defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
+                                          vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W;
+  defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+                                          vx64xmem, mgatherv4i32>, EVEX_V128, VEX_W;
+  defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                                          vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+}
 }
 
-let ExeDomain = SSEPackedSingle in {
-defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem,
-                                 mgatherv16i32>, EVEX_V512;
-defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512;
+multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
+                       AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
+                                       vz32mem, mgatherv16i32>, EVEX_V512;
+  defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256,
+                                       vz64mem, mgatherv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+                                          vy32xmem, mgatherv8i32>, EVEX_V256;
+  defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                                          vy32xmem, mgatherv4i64>, EVEX_V256;
+  defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+                                          vx32xmem, mgatherv4i32>, EVEX_V128;
+  defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                                          vx32xmem, mgatherv2i64>, EVEX_V128;
+}
 }
 
-defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem,
-                                 mgatherv8i32>, EVEX_V512, VEX_W;
-defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem,
-                                 mgatherv16i32>, EVEX_V512;
-defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512, VEX_W;
-defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem,
-                                 mgatherv8i64>, EVEX_V512;
+defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
+               avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
+
+defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
+                avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
 
 multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                           X86MemOperand memop, PatFrag ScatterNode> {
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 6bf589f7bf7..f1fb9b1c6f4 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -560,6 +560,14 @@
 def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
   return false;
 }]>;
+def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
 def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
   (masked_gather node:$src1, node:$src2, node:$src3) , [{
   if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
@@ -568,6 +576,20 @@ def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
   return false;
 }]>;
 
+def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64);
+  return false;
+}]>;
+def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
 def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
   (masked_gather node:$src1, node:$src2, node:$src3) , [{
   if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index e936b4bc466..6f38cb8eaf3 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -282,6 +282,10 @@ let RenderMethod = "addMemOperands" in {
   def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; }
   def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; }
   def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; }
+  def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; }
+  def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; }
+  def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; }
+  def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; }
 }
 
 def X86AbsMemAsmOperand : AsmOperandClass {
@@ -332,7 +336,11 @@ def vx32mem : X86VMemOperand<VR128,  "printi32mem", X86MemVX32Operand>;
 def vy32mem  : X86VMemOperand<VR256,  "printi32mem", X86MemVY32Operand>;
 def vx64mem  : X86VMemOperand<VR128,  "printi64mem", X86MemVX64Operand>;
 def vy64mem  : X86VMemOperand<VR256,  "printi64mem", X86MemVY64Operand>;
-def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64Operand>;
+
+def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>;
+def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86MemVX64XOperand>;
+def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>;
+def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>;
 def vz32mem  : X86VMemOperand<VR512,  "printi32mem", X86MemVZ32Operand>;
 def vz64mem  : X86VMemOperand<VR512,  "printi64mem", X86MemVZ64Operand>;
 
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index a15404ce780..c037b7b3d82 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -56,6 +56,22 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
   X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
+
X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 0e32a1c2806..ea3563b312a 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32) declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32) @@ -10,52 +10,60 @@ declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32) declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32) -;CHECK-LABEL: gather_mask_dps -;CHECK: kmovw -;CHECK: vgatherdps -;CHECK: vpadd -;CHECK: vscatterdps -;CHECK: ret define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dps: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_dpd -;CHECK: kmovw -;CHECK: vgatherdpd -;CHECK: vpadd -;CHECK: vscatterdpd -;CHECK: ret define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dpd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qps -;CHECK: kmovw -;CHECK: vgatherqps -;CHECK: vpadd -;CHECK: vscatterqps -;CHECK: ret define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, 
i8* %stbuf) { +; CHECK-LABEL: gather_mask_qps: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qpd -;CHECK: kmovw -;CHECK: vgatherqpd -;CHECK: vpadd -;CHECK: vscatterqpd -;CHECK: ret define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qpd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4) @@ -74,162 +82,469 @@ declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i3 declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32) declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32) -;CHECK-LABEL: gather_mask_dd -;CHECK: kmovw -;CHECK: vpgatherdd -;CHECK: vpadd -;CHECK: vpscatterdd -;CHECK: ret define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qd -;CHECK: kmovw -;CHECK: vpgatherqd -;CHECK: vpadd -;CHECK: vpscatterqd -;CHECK: ret define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qq -;CHECK: kmovw -;CHECK: vpgatherqq -;CHECK: vpadd -;CHECK: vpscatterqq -;CHECK: ret define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qq: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: 
vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_dq -;CHECK: kmovw -;CHECK: vpgatherdq -;CHECK: vpadd -;CHECK: vpscatterdq -;CHECK: ret define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dq: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4) ret void } - -;CHECK-LABEL: gather_mask_dpd_execdomain -;CHECK: vgatherdpd -;CHECK: vmovapd -;CHECK: ret define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) { +; CHECK-LABEL: gather_mask_dpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf ret void } -;CHECK-LABEL: gather_mask_qpd_execdomain -;CHECK: vgatherqpd -;CHECK: vmovapd -;CHECK: ret define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) { +; CHECK-LABEL: gather_mask_qpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf ret void } -;CHECK-LABEL: gather_mask_dps_execdomain -;CHECK: vgatherdps -;CHECK: vmovaps -;CHECK: ret define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) { +; CHECK-LABEL: gather_mask_dps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) ret <16 x float> %res; } -;CHECK-LABEL: gather_mask_qps_execdomain -;CHECK: vgatherqps -;CHECK: vmovaps -;CHECK: ret define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) { +; CHECK-LABEL: gather_mask_qps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) ret <8 x float> %res; } -;CHECK-LABEL: scatter_mask_dpd_execdomain -;CHECK: vmovapd -;CHECK: vscatterdpd -;CHECK: ret define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) { - %x = 
load <8 x double>, <8 x double>* %src, align 64 +; CHECK-LABEL: scatter_mask_dpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1} +; CHECK-NEXT: retq + %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_qpd_execdomain -;CHECK: vmovapd -;CHECK: vscatterqpd -;CHECK: ret define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: scatter_mask_qpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_dps_execdomain -;CHECK: vmovaps -;CHECK: vscatterdps -;CHECK: ret define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: scatter_mask_dps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = load <16 x float>, <16 x float>* %src, align 64 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_qps_execdomain -;CHECK: vmovaps -;CHECK: vscatterqps -;CHECK: ret define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) { - %x = load <8 x float>, <8 x float>* %src, align 32 +; CHECK-LABEL: scatter_mask_qps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq + %x = load <8 x float>, <8 x float>* %src, align 32 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_qps -;CHECK: kxnorw -;CHECK: vgatherqps -;CHECK: vpadd -;CHECK: vscatterqps -;CHECK: ret define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_qps: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4) %ind2 = add <8 x i64> %ind, call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: prefetch -;CHECK: gatherpf0 -;CHECK: gatherpf1 -;CHECK: scatterpf0 -;CHECK: scatterpf1 -;CHECK: ret declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32); declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32); define void @prefetch(<8 x i64> %ind, i8* %base) { +; CHECK-LABEL: prefetch: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: 
vscatterpf1qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1) call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0) call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1) ret void } + + +declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32) + +define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32) + +define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* 
%x1, <4 x i64> %x2, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqps (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqps (%rdi,%ymm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32) + +define <2 x 
double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32) + +define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> 
@llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 0) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32) + +define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2} +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 0) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s index 9bd3081ff96..32787a03499 100644 --- a/test/MC/X86/avx512-encodings.s +++ b/test/MC/X86/avx512-encodings.s @@ -9793,3 +9793,115 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2 // CHECK: vpabsq -1032(%rdx){1to8}, %zmm5 // CHECK: encoding: [0x62,0xf2,0xfd,0x58,0x1f,0xaa,0xf8,0xfb,0xff,0xff] vpabsq -1032(%rdx){1to8}, %zmm5 + +// CHECK: vpgatherdd 123(%r14,%zmm11,8), %zmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x8c,0xde,0x7b,0x00,0x00,0x00] + vpgatherdd 123(%r14, %zmm11,8), %zmm17 {%k1} + +// CHECK: vpgatherdd 256(%r9,%zmm11), %zmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x4c,0x19,0x40] + vpgatherdd 256(%r9,%zmm11), %zmm17 {%k1} + +// CHECK: vpgatherdd 1024(%rcx,%zmm11,4), %zmm17 {%k1} +// CHECK: encoding: 
[0x62,0xa2,0x7d,0x49,0x90,0x8c,0x99,0x00,0x04,0x00,0x00] + vpgatherdd 1024(%rcx, %zmm11,4), %zmm17 {%k1} + +// CHECK: vpgatherdq 123(%r14,%ymm14,8), %zmm8 {%k1} +// CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x84,0xf6,0x7b,0x00,0x00,0x00] + vpgatherdq 123(%r14, %ymm14,8), %zmm8 {%k1} + +// CHECK: vpgatherdq 256(%r9,%ymm14), %zmm8 {%k1} +// CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x44,0x31,0x20] + vpgatherdq 256(%r9, %ymm14), %zmm8 {%k1} + +// CHECK: vpgatherdq 1024(%rcx,%ymm14,4), %zmm8 {%k1} +// CHECK: encoding: [0x62,0x32,0xfd,0x49,0x90,0x84,0xb1,0x00,0x04,0x00,0x00] + vpgatherdq 1024(%rcx, %ymm14,4), %zmm8 {%k1} + +// CHECK: vpgatherqd 123(%r14,%zmm17,8), %ymm3 {%k1} +// CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x9c,0xce,0x7b,0x00,0x00,0x00] + vpgatherqd 123(%r14, %zmm17,8), %ymm3 {%k1} + +// CHECK: vpgatherqd 256(%r9,%zmm17), %ymm3 {%k1} +// CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x5c,0x09,0x40] + vpgatherqd 256(%r9,%zmm17), %ymm3 {%k1} + +// CHECK: vpgatherqd 1024(%rcx,%zmm17,4), %ymm3 {%k1} +// CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00] + vpgatherqd 1024(%rcx, %zmm17,4), %ymm3 {%k1} + +// CHECK: vpgatherqq 123(%r14,%zmm21,8), %zmm17 {%k1} +// CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x8c,0xee,0x7b,0x00,0x00,0x00] + vpgatherqq 123(%r14, %zmm21,8), %zmm17 {%k1} + +// CHECK: vpgatherqq 256(%r9,%zmm21), %zmm17 {%k1} +// CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x4c,0x29,0x20] + vpgatherqq 256(%r9,%zmm21), %zmm17 {%k1} + +// CHECK: vpgatherqq 1024(%rcx,%zmm21,4), %zmm17 {%k1} +// CHECK: encoding: [0x62,0xe2,0xfd,0x41,0x91,0x8c,0xa9,0x00,0x04,0x00,0x00] + vpgatherqq 1024(%rcx, %zmm21,4), %zmm17 {%k1} + +// CHECK: vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1} +// CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] + vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1} + +// CHECK: vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1} +// CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] + vpscatterdd %zmm19, 123(%r14,%zmm16,8) {%k1} + +// CHECK: vpscatterdd %zmm19, 256(%r9,%zmm16) {%k1} +// CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x5c,0x01,0x40] + vpscatterdd %zmm19, 256(%r9,%zmm16) {%k1} + +// CHECK: vpscatterdd %zmm19, 1024(%rcx,%zmm16,4) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa0,0x9c,0x81,0x00,0x04,0x00,0x00] + vpscatterdd %zmm19, 1024(%rcx,%zmm16,4) {%k1} + +// CHECK: vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1} +// CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] + vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1} + +// CHECK: vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1} +// CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] + vpscatterdq %zmm5, 123(%r14,%ymm6,8) {%k1} + +// CHECK: vpscatterdq %zmm5, 256(%r9,%ymm6) {%k1} +// CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0x6c,0x31,0x20] + vpscatterdq %zmm5, 256(%r9,%ymm6) {%k1} + +// CHECK: vpscatterdq %zmm5, 1024(%rcx,%ymm6,4) {%k1} +// CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xa0,0xac,0xb1,0x00,0x04,0x00,0x00] + vpscatterdq %zmm5, 1024(%rcx,%ymm6,4) {%k1} + +// CHECK: vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1} +// CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00] + vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1} + +// CHECK: vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1} +// CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00] + vpscatterqd %ymm20, 123(%r14,%zmm2,8) {%k1} + +// CHECK: vpscatterqd %ymm20, 256(%r9,%zmm2) {%k1} +// CHECK: encoding: 
[0x62,0xc2,0x7d,0x49,0xa1,0x64,0x11,0x40] + vpscatterqd %ymm20, 256(%r9,%zmm2) {%k1} + +// CHECK: vpscatterqd %ymm20, 1024(%rcx,%zmm2,4) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa1,0xa4,0x91,0x00,0x04,0x00,0x00] + vpscatterqd %ymm20, 1024(%rcx,%zmm2,4) {%k1} + +// CHECK: vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1} +// CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00] + vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1} + +// CHECK: vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1} +// CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00] + vpscatterqq %zmm14, 123(%r14,%zmm20,8) {%k1} + +// CHECK: vpscatterqq %zmm14, 256(%r9,%zmm20) {%k1} +// CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0x74,0x21,0x20] + vpscatterqq %zmm14, 256(%r9,%zmm20) {%k1} + +// CHECK: vpscatterqq %zmm14, 1024(%rcx,%zmm20,4) {%k1} +// CHECK: encoding: [0x62,0x72,0xfd,0x41,0xa1,0xb4,0xa1,0x00,0x04,0x00,0x00] + vpscatterqq %zmm14, 1024(%rcx,%zmm20,4) {%k1} diff --git a/test/MC/X86/avx512vl-encoding.s b/test/MC/X86/avx512vl-encoding.s index e3ec448737e..b4b28a62b85 100644 --- a/test/MC/X86/avx512vl-encoding.s +++ b/test/MC/X86/avx512vl-encoding.s @@ -1452,3 +1452,194 @@ // CHECK: encoding: [0x62,0xe2,0xfd,0x38,0x1f,0xb2,0xf8,0xfb,0xff,0xff] vpabsq -1032(%rdx){1to4}, %ymm22 +// CHECK: vpgatherdd 123(%r14,%xmm31,8), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00] + vpgatherdd 123(%r14,%xmm31,8), %xmm17 {%k1} + +// CHECK: vpgatherdd 256(%r9,%xmm31), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x4c,0x39,0x40] + vpgatherdd 256(%r9,%xmm31), %xmm17 {%k1} + +// CHECK: vpgatherdd 1024(%rcx,%xmm31,4), %xmm17 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00] + vpgatherdd 1024(%rcx,%xmm31,4), %xmm17 {%k1} + +// CHECK: vpgatherdd 123(%r14,%ymm31,8), %ymm19 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x9c,0xfe,0x7b,0x00,0x00,0x00] + vpgatherdd 123(%r14,%ymm31,8), %ymm19 {%k1} + +// CHECK: vpgatherdd 256(%r9,%ymm31), %ymm19 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x5c,0x39,0x40] + vpgatherdd 256(%r9,%ymm31), %ymm19 {%k1} + +// CHECK: vpgatherdd 1024(%rcx,%ymm31,4), %ymm19 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x90,0x9c,0xb9,0x00,0x04,0x00,0x00] + vpgatherdd 1024(%rcx,%ymm31,4), %ymm19 {%k1} + +// CHECK: vpgatherdq 123(%r14,%xmm31,8), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00] + vpgatherdq 123(%r14,%xmm31,8), %xmm17 {%k1} + +// CHECK: vpgatherdq 256(%r9,%xmm31), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x4c,0x39,0x20] + vpgatherdq 256(%r9,%xmm31), %xmm17 {%k1} + +// CHECK: vpgatherdq 1024(%rcx,%xmm31,4), %xmm17 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00] + vpgatherdq 1024(%rcx,%xmm31,4), %xmm17 {%k1} + +// CHECK: vpgatherdq 123(%r14,%xmm31,8), %ymm26 {%k1} +// CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x94,0xfe,0x7b,0x00,0x00,0x00] + vpgatherdq 123(%r14,%xmm31,8), %ymm26 {%k1} + +// CHECK: vpgatherdq 256(%r9,%xmm31), %ymm26 {%k1} +// CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x54,0x39,0x20] + vpgatherdq 256(%r9,%xmm31), %ymm26 {%k1} + +// CHECK: vpgatherdq 1024(%rcx,%xmm31,4), %ymm26 {%k1} +// CHECK: encoding: [0x62,0x22,0xfd,0x21,0x90,0x94,0xb9,0x00,0x04,0x00,0x00] + vpgatherdq 1024(%rcx,%xmm31,4), %ymm26 {%k1} + +// CHECK: vpgatherqd 123(%r14,%xmm31,8), %xmm21 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0xac,0xfe,0x7b,0x00,0x00,0x00] + vpgatherqd 
123(%r14,%xmm31,8), %xmm21 {%k1} + +// CHECK: vpgatherqd 256(%r9,%xmm31), %xmm21 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0x6c,0x39,0x40] + vpgatherqd 256(%r9,%xmm31), %xmm21 {%k1} + +// CHECK: vpgatherqd 1024(%rcx,%xmm31,4), %xmm21 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x91,0xac,0xb9,0x00,0x04,0x00,0x00] + vpgatherqd 1024(%rcx,%xmm31,4), %xmm21 {%k1} + +// CHECK: vpgatherqd 123(%r14,%ymm31,8), %xmm25 {%k1} +// CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x8c,0xfe,0x7b,0x00,0x00,0x00] + vpgatherqd 123(%r14,%ymm31,8), %xmm25 {%k1} + +// CHECK: vpgatherqd 256(%r9,%ymm31), %xmm25 {%k1} +// CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x4c,0x39,0x40] + vpgatherqd 256(%r9,%ymm31), %xmm25 {%k1} + +// CHECK: vpgatherqd 1024(%rcx,%ymm31,4), %xmm25 {%k1} +// CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0x8c,0xb9,0x00,0x04,0x00,0x00] + vpgatherqd 1024(%rcx,%ymm31,4), %xmm25 {%k1} + +// CHECK: vpgatherqq 123(%r14,%xmm31,8), %xmm18 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x94,0xfe,0x7b,0x00,0x00,0x00] + vpgatherqq 123(%r14,%xmm31,8), %xmm18 {%k1} + +// CHECK: vpgatherqq 256(%r9,%xmm31), %xmm18 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x54,0x39,0x20] + vpgatherqq 256(%r9,%xmm31), %xmm18 {%k1} + +// CHECK: vpgatherqq 1024(%rcx,%xmm31,4), %xmm18 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0x94,0xb9,0x00,0x04,0x00,0x00] + vpgatherqq 1024(%rcx,%xmm31,4), %xmm18 {%k1} + +// CHECK: vpgatherqq 123(%r14,%ymm31,8), %ymm19 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x9c,0xfe,0x7b,0x00,0x00,0x00] + vpgatherqq 123(%r14,%ymm31,8), %ymm19 {%k1} + +// CHECK: vpgatherqq 256(%r9,%ymm31), %ymm19 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x5c,0x39,0x20] + vpgatherqq 256(%r9,%ymm31), %ymm19 {%k1} + +// CHECK: vpgatherqq 1024(%rcx,%ymm31,4), %ymm19 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x91,0x9c,0xb9,0x00,0x04,0x00,0x00] + vpgatherqq 1024(%rcx,%ymm31,4), %ymm19 {%k1} + +// CHECK: vgatherdpd 123(%r14,%xmm31,8), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x8c,0xfe,0x7b,0x00,0x00,0x00] + vgatherdpd 123(%r14,%xmm31,8), %xmm17 {%k1} + +// CHECK: vgatherdpd 256(%r9,%xmm31), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x4c,0x39,0x20] + vgatherdpd 256(%r9,%xmm31), %xmm17 {%k1} + +// CHECK: vgatherdpd 1024(%rcx,%xmm31,4), %xmm17 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0x8c,0xb9,0x00,0x04,0x00,0x00] + vgatherdpd 1024(%rcx,%xmm31,4), %xmm17 {%k1} + +// CHECK: vgatherdpd 123(%r14,%xmm31,8), %ymm23 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0xbc,0xfe,0x7b,0x00,0x00,0x00] + vgatherdpd 123(%r14,%xmm31,8), %ymm23 {%k1} + +// CHECK: vgatherdpd 256(%r9,%xmm31), %ymm23 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x7c,0x39,0x20] + vgatherdpd 256(%r9,%xmm31), %ymm23 {%k1} + +// CHECK: vgatherdpd 1024(%rcx,%xmm31,4), %ymm23 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00] + vgatherdpd 1024(%rcx,%xmm31,4), %ymm23 {%k1} + +// CHECK: vgatherdps 123(%r14,%xmm31,8), %xmm18 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x94,0xfe,0x7b,0x00,0x00,0x00] + vgatherdps 123(%r14,%xmm31,8), %xmm18 {%k1} + +// CHECK: vgatherdps 256(%r9,%xmm31), %xmm18 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x54,0x39,0x40] + vgatherdps 256(%r9,%xmm31), %xmm18 {%k1} + +// CHECK: vgatherdps 1024(%rcx,%xmm31,4), %xmm18 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x92,0x94,0xb9,0x00,0x04,0x00,0x00] + vgatherdps 1024(%rcx,%xmm31,4), %xmm18 {%k1} + +// CHECK: vgatherdps 
123(%r14,%ymm31,8), %ymm27 {%k1} +// CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x9c,0xfe,0x7b,0x00,0x00,0x00] + vgatherdps 123(%r14,%ymm31,8), %ymm27 {%k1} + +// CHECK: vgatherdps 256(%r9,%ymm31), %ymm27 {%k1} +// CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x5c,0x39,0x40] + vgatherdps 256(%r9,%ymm31), %ymm27 {%k1} + +// CHECK: vgatherdps 1024(%rcx,%ymm31,4), %ymm27 {%k1} +// CHECK: encoding: [0x62,0x22,0x7d,0x21,0x92,0x9c,0xb9,0x00,0x04,0x00,0x00] + vgatherdps 1024(%rcx,%ymm31,4), %ymm27 {%k1} + +// CHECK: vgatherqpd 123(%r14,%xmm31,8), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x8c,0xfe,0x7b,0x00,0x00,0x00] + vgatherqpd 123(%r14,%xmm31,8), %xmm17 {%k1} + +// CHECK: vgatherqpd 256(%r9,%xmm31), %xmm17 {%k1} +// CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x4c,0x39,0x20] + vgatherqpd 256(%r9,%xmm31), %xmm17 {%k1} + +// CHECK: vgatherqpd 1024(%rcx,%xmm31,4), %xmm17 {%k1} +// CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x8c,0xb9,0x00,0x04,0x00,0x00] + vgatherqpd 1024(%rcx,%xmm31,4), %xmm17 {%k1} + +// CHECK: vgatherqpd 123(%r14,%ymm31,8), %ymm29 {%k1} +// CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00] + vgatherqpd 123(%r14,%ymm31,8), %ymm29 {%k1} + +// CHECK: vgatherqpd 256(%r9,%ymm31), %ymm29 {%k1} +// CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0x6c,0x39,0x20] + vgatherqpd 256(%r9,%ymm31), %ymm29 {%k1} + +// CHECK: vgatherqpd 1024(%rcx,%ymm31,4), %ymm29 {%k1} +// CHECK: encoding: [0x62,0x22,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00] + vgatherqpd 1024(%rcx,%ymm31,4), %ymm29 {%k1} + +// CHECK: vgatherqps 123(%r14,%xmm31,8), %xmm21 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00] + vgatherqps 123(%r14,%xmm31,8), %xmm21 {%k1} + +// CHECK: vgatherqps 256(%r9,%xmm31), %xmm21 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x6c,0x39,0x40] + vgatherqps 256(%r9,%xmm31), %xmm21 {%k1} + +// CHECK: vgatherqps 1024(%rcx,%xmm31,4), %xmm21 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xac,0xb9,0x00,0x04,0x00,0x00] + vgatherqps 1024(%rcx,%xmm31,4), %xmm21 {%k1} + +// CHECK: vgatherqps 123(%r14,%ymm31,8), %xmm19 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x9c,0xfe,0x7b,0x00,0x00,0x00] + vgatherqps 123(%r14,%ymm31,8), %xmm19 {%k1} + +// CHECK: vgatherqps 256(%r9,%ymm31), %xmm19 {%k1} +// CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x5c,0x39,0x40] + vgatherqps 256(%r9,%ymm31), %xmm19 {%k1} + +// CHECK: vgatherqps 1024(%rcx,%ymm31,4), %xmm19 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x93,0x9c,0xb9,0x00,0x04,0x00,0x00] + vgatherqps 1024(%rcx,%ymm31,4), %xmm19 {%k1} diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp index dde21c6d45f..efcb0c81e1c 100644 --- a/utils/TableGen/X86RecognizableInstr.cpp +++ b/utils/TableGen/X86RecognizableInstr.cpp @@ -1027,9 +1027,12 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("GR32_NOAX", TYPE_Rv) TYPE("GR64_NOAX", TYPE_R64) TYPE("vx32mem", TYPE_M32) + TYPE("vx32xmem", TYPE_M32) TYPE("vy32mem", TYPE_M32) + TYPE("vy32xmem", TYPE_M32) TYPE("vz32mem", TYPE_M32) TYPE("vx64mem", TYPE_M64) + TYPE("vx64xmem", TYPE_M64) TYPE("vy64mem", TYPE_M64) TYPE("vy64xmem", TYPE_M64) TYPE("vz64mem", TYPE_M64) @@ -1213,9 +1216,12 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s, ENCODING("opaque80mem", ENCODING_RM) ENCODING("opaque512mem", ENCODING_RM) ENCODING("vx32mem", ENCODING_RM) + ENCODING("vx32xmem", ENCODING_RM) ENCODING("vy32mem", ENCODING_RM) + ENCODING("vy32xmem", ENCODING_RM) 
ENCODING("vz32mem", ENCODING_RM) ENCODING("vx64mem", ENCODING_RM) + ENCODING("vx64xmem", ENCODING_RM) ENCODING("vy64mem", ENCODING_RM) ENCODING("vy64xmem", ENCODING_RM) ENCODING("vz64mem", ENCODING_RM)