From 6adcd58d3c58a8eeb21bc1bfe399c7b03592f273 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Sun, 1 Sep 2013 14:24:41 +0000
Subject: [PATCH] AVX-512: Added GATHER and SCATTER instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189729 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 225 +++++++++++++++++-
 lib/Target/X86/X86InstrAVX512.td          |  63 +++++
 .../X86/avx512-gather-scatter-intrin.ll   | 223 +++++++++++++++++
 3 files changed, 508 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512-gather-scatter-intrin.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 73c4a1cabf5..739c1448cdf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1445,6 +1445,7 @@ void X86TargetLowering::resetOperationActions() {
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -11623,7 +11624,87 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   }
 }
 
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                             SDValue Base, SDValue Index,
+                             SDValue ScaleOp, SDValue Chain,
+                             const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (Src.getOpcode() == ISD::UNDEF)
+    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Base, SDValue Index,
+                              SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   SDLoc dl(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   switch (IntNo) {
@@ -11658,7 +11739,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                        SDValue(Result.getNode(), 2));
   }
-
+  //int_gather(index, base, scale);
+  case Intrinsic::x86_avx512_gather_qpd_512:
+  case Intrinsic::x86_avx512_gather_qps_512:
+  case Intrinsic::x86_avx512_gather_dpd_512:
+  case Intrinsic::x86_avx512_gather_qpi_512:
+  case Intrinsic::x86_avx512_gather_qpq_512:
+  case Intrinsic::x86_avx512_gather_dpq_512:
+  case Intrinsic::x86_avx512_gather_dps_512:
+  case Intrinsic::x86_avx512_gather_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+    case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+    case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+    case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+    case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+    case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+    case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+    case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Index = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
+    SDValue Scale = Op.getOperand(4);
+    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
+  }
+  //int_gather_mask(v1, mask, index, base, scale);
+  case Intrinsic::x86_avx512_gather_qps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpi_mask_512:
+  case Intrinsic::x86_avx512_gather_qpq_mask_512:
+  case Intrinsic::x86_avx512_gather_dpi_mask_512:
+  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::x86_avx512_gather_qps_mask_512:
+      Opc = X86::VGATHERQPSZrm; break;
+    case Intrinsic::x86_avx512_gather_qpd_mask_512:
+      Opc = X86::VGATHERQPDZrm; break;
+    case Intrinsic::x86_avx512_gather_dpd_mask_512:
+      Opc = X86::VGATHERDPDZrm; break;
+    case Intrinsic::x86_avx512_gather_dps_mask_512:
+      Opc = X86::VGATHERDPSZrm; break;
+    case Intrinsic::x86_avx512_gather_qpi_mask_512:
+      Opc = X86::VPGATHERQDZrm; break;
+    case Intrinsic::x86_avx512_gather_qpq_mask_512:
+      Opc = X86::VPGATHERQQZrm; break;
+    case Intrinsic::x86_avx512_gather_dpi_mask_512:
+      Opc = X86::VPGATHERDDZrm; break;
+    case Intrinsic::x86_avx512_gather_dpq_mask_512:
+      Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Base = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                          Subtarget);
+  }
+  //int_scatter(base, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qpd_512:
+  case Intrinsic::x86_avx512_scatter_qps_512:
+  case Intrinsic::x86_avx512_scatter_dpd_512:
+  case Intrinsic::x86_avx512_scatter_qpi_512:
+  case Intrinsic::x86_avx512_scatter_qpq_512:
+  case Intrinsic::x86_avx512_scatter_dpq_512:
+  case Intrinsic::x86_avx512_scatter_dps_512:
+  case Intrinsic::x86_avx512_scatter_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::x86_avx512_scatter_qpd_512:
+      Opc = X86::VSCATTERQPDZmr; break;
+    case Intrinsic::x86_avx512_scatter_qps_512:
+      Opc = X86::VSCATTERQPSZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpd_512:
+      Opc = X86::VSCATTERDPDZmr; break;
+    case Intrinsic::x86_avx512_scatter_dps_512:
+      Opc = X86::VSCATTERDPSZmr; break;
+    case Intrinsic::x86_avx512_scatter_qpi_512:
+      Opc = X86::VPSCATTERQDZmr; break;
+    case Intrinsic::x86_avx512_scatter_qpq_512:
+      Opc = X86::VPSCATTERQQZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpq_512:
+      Opc = X86::VPSCATTERDQZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpi_512:
+      Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Src = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
+  }
+  //int_scatter_mask(base, mask, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+      Opc = X86::VSCATTERQPDZmr; break;
+    case Intrinsic::x86_avx512_scatter_qps_mask_512:
+      Opc = X86::VSCATTERQPSZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+      Opc = X86::VSCATTERDPDZmr; break;
+    case Intrinsic::x86_avx512_scatter_dps_mask_512:
+      Opc = X86::VSCATTERDPSZmr; break;
+    case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+      Opc = X86::VPSCATTERQDZmr; break;
+    case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+      Opc = X86::VPSCATTERQQZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+      Opc = X86::VPSCATTERDQZmr; break;
+    case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+      Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Src = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
   // XTEST intrinsics.
   case Intrinsic::x86_xtest: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -13093,7 +13311,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   case ISD::FRAME_TO_ARGS_OFFSET:
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 6b2f1608ca4..ea3a4e17416 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2922,6 +2922,69 @@ defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext,
                              memopv4i64, i256mem, v8i64, v8i32>,
                              EVEX_V512, EVEX_CD8<32, CD8VH>;
 
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                         RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
+  Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+            (ins RC:$src1, KRC:$mask, memop:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>,
+                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                          RegisterClass RC, X86MemOperand memop> {
+let mayStore = 1, Constraints = "$mask = $mask_wb" in
+  def mr  : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
+            (ins memop:$dst, KRC:$mask, RC:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+                    EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
+                    EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
+                    EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
+                    EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
 //===----------------------------------------------------------------------===//
 // VSHUFPS - VSHUFPD Operations
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
new file mode 100644
index 00000000000..2b87d44fc57
--- /dev/null
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -0,0 +1,223 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+
+declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+
+;CHECK: gather_mask_dps
+;CHECK: kmovw
+;CHECK: vgatherdps
+;CHECK: vpadd
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
+  %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_dpd
+;CHECK: kmovw
+;CHECK: vgatherdpd
+;CHECK: vpadd
+;CHECK: vscatterdpd
+;CHECK: ret
+define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_qps
+;CHECK: kmovw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_qpd
+;CHECK: kmovw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+;;
+;; Integer Gather/Scatter
+;;
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+
+;CHECK: gather_mask_dd
+;CHECK: kmovw
+;CHECK: vpgatherdd
+;CHECK: vpadd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
+  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_qd
+;CHECK: kmovw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_qq
+;CHECK: kmovw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_mask_dq
+;CHECK: kmovw
+;CHECK: vpgatherdq
+;CHECK: vpadd
+;CHECK: vpscatterdq
+;CHECK: ret
+define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;; FP Intrinsics without masks
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
+
+;CHECK: gather_dps
+;CHECK: kxnorw
+;CHECK: vgatherdps
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
+  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_qpd
+;CHECK: kxnorw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+
+;; Integer Intrinsics without masks
+
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
+
+;CHECK: gather_dpi
+;CHECK: kxnorw
+;CHECK: vpgatherdd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
+  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_qpq
+;CHECK: kxnorw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;CHECK: gather_qpi
+;CHECK: kxnorw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+  ret void
+}
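+
+; The test below is an added illustrative sketch, not part of the original
+; patch: it exercises the unmasked dpq intrinsics declared above, which the
+; original tests leave uncovered. The CHECK lines assume codegen mirrors the
+; qpq case: an all-ones mask materialized with kxnorw, then vpgatherdq, a
+; vpaddd on the <8 x i32> index vector, and vpscatterdq.
+;CHECK: gather_dpq
+;CHECK: kxnorw
+;CHECK: vpgatherdq
+;CHECK: vpadd
+;CHECK: vpscatterdq
+;CHECK: ret
+define void @gather_dpq(<8 x i32> %ind, i8* %base, i8* %stbuf) {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}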