AVX-512: changes in intrinsics

1) Changed gather and scatter intrinsics. Now they are aligned with GCC built-ins. There is no more non-masked form. Masked intrinsic receives -1 if all lanes are executed.
2) I changed the function that works with intrinsics inside X86ISelLowering.cpp. I put all intrinsics in one table. I did it for INTRINSICS_W_CHAIN and plan to put all intrinsics from WO_CHAIN set to the same table in order to avoid the long-long "switch". (I wanted to use static map initialization that allowed by C++11 but I wasn't able to compile it on VS2012).
3) I added gather/scatter prefetch intrinsics.
4) I fixed MRMm encoding for masked instructions.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208522 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2014-05-12 07:18:51 +00:00
parent f37d8e6b1a
commit 1cec507d6d
6 changed files with 373 additions and 439 deletions

View File

@ -3029,141 +3029,104 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Gather and Scatter ops
let TargetPrefix = "x86" in {
def int_x86_avx512_gather_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gathersiv16sf">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8df">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqps512">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16sf">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherdpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gatherdps512">,
Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherqpd512">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherqps512">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpi512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpi512">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gatherdpq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gatherdpi512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherqpq512">,
def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gathersiv8di">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty],
llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherqpi512">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty],
def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gathersiv16si">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8di">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16si">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
// scatter
def int_x86_avx512_scatter_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpd512">,
def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdps512">,
def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scattersiv16sf">,
Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpd512">,
def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8df">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqps512">,
def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16sf">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterdpd512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f64_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scatterdps512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_v16f32_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterqpd512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f64_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterqps512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f32_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpq512">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty,
llvm_v8i64_ty, llvm_i32_ty],
def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scattersiv8di">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpi512">,
def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scattersiv16si">,
Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpq512">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8di">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,llvm_v8i64_ty, llvm_v8i64_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpi512">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty],
def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16si">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty,
llvm_i32_ty],
[IntrReadWriteArgMem]>;
def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scatterdpq512">,
Intrinsic<[], [llvm_ptr_ty,
llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
[]>;
def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scatterdpi512">,
Intrinsic<[], [llvm_ptr_ty,
llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
[]>;
def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterqpq512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i64_ty,
llvm_i32_ty],
[]>;
def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterqpi512">,
Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i32_ty,
llvm_i32_ty],
[]>;
// gather prefetch
def int_x86_avx512_gatherpf_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfdpd">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_gatherpf_dps_512 : GCCBuiltin<"__builtin_ia32_gatherpfdps">,
Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_gatherpf_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfqpd">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_gatherpf_qps_512 : GCCBuiltin<"__builtin_ia32_gatherpfqps">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
// scatter prefetch
def int_x86_avx512_scatterpf_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfdpd">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_scatterpf_dps_512 : GCCBuiltin<"__builtin_ia32_scatterpfdps">,
Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_scatterpf_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfqpd">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">,
Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
}
// AVX-512 conflict detection

View File

@ -643,6 +643,10 @@ namespace X86II {
/// counted as one operand.
///
inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
switch (TSFlags & X86II::FormMask) {
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
@ -660,9 +664,6 @@ namespace X86II {
case X86II::MRMDestMem:
return 0;
case X86II::MRMSrcMem: {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
@ -690,6 +691,8 @@ namespace X86II {
unsigned FirstMemOp = 0;
if (HasVEX_4V)
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
if (HasEVEX_K)
++FirstMemOp;// Skip the mask register
return FirstMemOp;
}
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:

View File

@ -1428,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6r: case X86II::MRM7r: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitRegModRMByte(MI.getOperand(CurOp++),
@ -1443,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6m: case X86II::MRM7m: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,

View File

@ -12420,27 +12420,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
}
static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
@ -12450,7 +12429,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@ -12463,24 +12447,6 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
@ -12491,13 +12457,41 @@ static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
//SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
// also used to custom lower READCYCLECOUNTER nodes.
@ -12561,27 +12555,122 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Results, DL);
}
enum IntrinsicType {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
};
struct IntrinsicData {
IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
:Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
IntrinsicType Type;
unsigned Opc0;
unsigned Opc1;
};
std::map < unsigned, IntrinsicData> IntrMap;
static void InitIntinsicsMap() {
static bool Initialized = false;
if (Initialized)
return;
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512,
IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512,
IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512,
IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512,
IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512,
IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512,
IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512,
IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512,
IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512,
IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512,
IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512,
IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512,
IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
X86::VGATHERPF1QPSm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512,
IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
X86::VGATHERPF1QPDm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512,
IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
X86::VGATHERPF1DPDm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512,
IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
X86::VGATHERPF1DPSm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
X86::VSCATTERPF1QPSm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
X86::VSCATTERPF1QPDm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
X86::VSCATTERPF1DPDm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
X86::VSCATTERPF1DPSm)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
IntrinsicData(XTEST, X86ISD::XTEST, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
Initialized = true;
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
InitIntinsicsMap();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
if (itr == IntrMap.end())
return SDValue();
// RDRAND/RDSEED intrinsics.
case Intrinsic::x86_rdrand_16:
case Intrinsic::x86_rdrand_32:
case Intrinsic::x86_rdrand_64:
case Intrinsic::x86_rdseed_16:
case Intrinsic::x86_rdseed_32:
case Intrinsic::x86_rdseed_64: {
unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
IntNo == Intrinsic::x86_rdseed_32 ||
IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
X86ISD::RDRAND;
SDLoc dl(Op);
IntrinsicData Intr = itr->second;
switch(Intr.Type) {
default:
llvm_unreachable("Unhandled intrinsic");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
@ -12597,162 +12686,49 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
//int_gather(index, base, scale);
case Intrinsic::x86_avx512_gather_qpd_512:
case Intrinsic::x86_avx512_gather_qps_512:
case Intrinsic::x86_avx512_gather_dpd_512:
case Intrinsic::x86_avx512_gather_qpi_512:
case Intrinsic::x86_avx512_gather_qpq_512:
case Intrinsic::x86_avx512_gather_dpq_512:
case Intrinsic::x86_avx512_gather_dps_512:
case Intrinsic::x86_avx512_gather_dpi_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Index = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Scale = Op.getOperand(4);
return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
}
//int_gather_mask(v1, mask, index, base, scale);
case Intrinsic::x86_avx512_gather_qps_mask_512:
case Intrinsic::x86_avx512_gather_qpd_mask_512:
case Intrinsic::x86_avx512_gather_dpd_mask_512:
case Intrinsic::x86_avx512_gather_dps_mask_512:
case Intrinsic::x86_avx512_gather_qpi_mask_512:
case Intrinsic::x86_avx512_gather_qpq_mask_512:
case Intrinsic::x86_avx512_gather_dpi_mask_512:
case Intrinsic::x86_avx512_gather_dpq_mask_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_avx512_gather_qps_mask_512:
Opc = X86::VGATHERQPSZrm; break;
case Intrinsic::x86_avx512_gather_qpd_mask_512:
Opc = X86::VGATHERQPDZrm; break;
case Intrinsic::x86_avx512_gather_dpd_mask_512:
Opc = X86::VGATHERDPDZrm; break;
case Intrinsic::x86_avx512_gather_dps_mask_512:
Opc = X86::VGATHERDPSZrm; break;
case Intrinsic::x86_avx512_gather_qpi_mask_512:
Opc = X86::VPGATHERQDZrm; break;
case Intrinsic::x86_avx512_gather_qpq_mask_512:
Opc = X86::VPGATHERQQZrm; break;
case Intrinsic::x86_avx512_gather_dpi_mask_512:
Opc = X86::VPGATHERDDZrm; break;
case Intrinsic::x86_avx512_gather_dpq_mask_512:
Opc = X86::VPGATHERDQZrm; break;
}
case GATHER: {
//gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Base = Op.getOperand(5);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
Subtarget);
}
//int_scatter(base, index, v1, scale);
case Intrinsic::x86_avx512_scatter_qpd_512:
case Intrinsic::x86_avx512_scatter_qps_512:
case Intrinsic::x86_avx512_scatter_dpd_512:
case Intrinsic::x86_avx512_scatter_qpi_512:
case Intrinsic::x86_avx512_scatter_qpq_512:
case Intrinsic::x86_avx512_scatter_dpq_512:
case Intrinsic::x86_avx512_scatter_dps_512:
case Intrinsic::x86_avx512_scatter_dpi_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_avx512_scatter_qpd_512:
Opc = X86::VSCATTERQPDZmr; break;
case Intrinsic::x86_avx512_scatter_qps_512:
Opc = X86::VSCATTERQPSZmr; break;
case Intrinsic::x86_avx512_scatter_dpd_512:
Opc = X86::VSCATTERDPDZmr; break;
case Intrinsic::x86_avx512_scatter_dps_512:
Opc = X86::VSCATTERDPSZmr; break;
case Intrinsic::x86_avx512_scatter_qpi_512:
Opc = X86::VPSCATTERQDZmr; break;
case Intrinsic::x86_avx512_scatter_qpq_512:
Opc = X86::VPSCATTERQQZmr; break;
case Intrinsic::x86_avx512_scatter_dpq_512:
Opc = X86::VPSCATTERDQZmr; break;
case Intrinsic::x86_avx512_scatter_dpi_512:
Opc = X86::VPSCATTERDDZmr; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Src = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
}
//int_scatter_mask(base, mask, index, v1, scale);
case Intrinsic::x86_avx512_scatter_qps_mask_512:
case Intrinsic::x86_avx512_scatter_qpd_mask_512:
case Intrinsic::x86_avx512_scatter_dpd_mask_512:
case Intrinsic::x86_avx512_scatter_dps_mask_512:
case Intrinsic::x86_avx512_scatter_qpi_mask_512:
case Intrinsic::x86_avx512_scatter_qpq_mask_512:
case Intrinsic::x86_avx512_scatter_dpi_mask_512:
case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_avx512_scatter_qpd_mask_512:
Opc = X86::VSCATTERQPDZmr; break;
case Intrinsic::x86_avx512_scatter_qps_mask_512:
Opc = X86::VSCATTERQPSZmr; break;
case Intrinsic::x86_avx512_scatter_dpd_mask_512:
Opc = X86::VSCATTERDPDZmr; break;
case Intrinsic::x86_avx512_scatter_dps_mask_512:
Opc = X86::VSCATTERDPSZmr; break;
case Intrinsic::x86_avx512_scatter_qpi_mask_512:
Opc = X86::VPSCATTERQDZmr; break;
case Intrinsic::x86_avx512_scatter_qpq_mask_512:
Opc = X86::VPSCATTERQQZmr; break;
case Intrinsic::x86_avx512_scatter_dpq_mask_512:
Opc = X86::VPSCATTERDQZmr; break;
case Intrinsic::x86_avx512_scatter_dpi_mask_512:
Opc = X86::VPSCATTERDDZmr; break;
}
case SCATTER: {
//scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
}
// Read Time Stamp Counter (RDTSC).
case Intrinsic::x86_rdtsc:
// Read Time Stamp Counter and Processor ID (RDTSCP).
case Intrinsic::x86_rdtscp: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_rdtsc:
Opc = X86ISD::RDTSC_DAG; break;
case Intrinsic::x86_rdtscp:
Opc = X86ISD::RDTSCP_DAG; break;
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal;
if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
(HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results);
getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case Intrinsic::x86_xtest: {
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,

View File

@ -4070,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
// prefetch
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
let Predicates = [HasPFI], hasSideEffects = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
!strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
[]>, EVEX, EVEX_K;
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations

View File

@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
;CHECK-LABEL: gather_mask_dps
;CHECK: kmovw
@ -17,9 +17,9 @@ declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x dou
;CHECK: vscatterdps
;CHECK: ret
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
%x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
@ -30,9 +30,9 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
;CHECK: vscatterdpd
;CHECK: ret
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
ret void
}
@ -43,9 +43,9 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
;CHECK: vscatterqps
;CHECK: ret
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
@ -56,23 +56,23 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
;CHECK: vscatterqpd
;CHECK: ret
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
ret void
}
;;
;; Integer Gather/Scatter
;;
declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
;CHECK-LABEL: gather_mask_dd
;CHECK: kmovw
@ -81,9 +81,9 @@ declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64
;CHECK: vpscatterdd
;CHECK: ret
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
@ -94,9 +94,9 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
;CHECK: vpscatterqd
;CHECK: ret
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
@ -107,9 +107,9 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
;CHECK: vpscatterqq
;CHECK: ret
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
@ -120,116 +120,19 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
;CHECK: vpscatterdq
;CHECK: ret
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
%x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
ret void
}
;; FP Intinsics without masks
declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
;CHECK-LABEL: gather_dps
;CHECK: kxnorw
;CHECK: vgatherdps
;CHECK: vscatterdps
;CHECK: ret
define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
%x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_qps
;CHECK: kxnorw
;CHECK: vgatherqps
;CHECK: vscatterqps
;CHECK: ret
define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_qpd
;CHECK: kxnorw
;CHECK: vgatherqpd
;CHECK: vpadd
;CHECK: vscatterqpd
;CHECK: ret
define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
ret void
}
;; Integer Intinsics without masks
declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
;CHECK-LABEL: gather_dpi
;CHECK: kxnorw
;CHECK: vpgatherdd
;CHECK: vpscatterdd
;CHECK: ret
define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_qpq
;CHECK: vpxord %zmm
;CHECK: kxnorw
;CHECK: vpgatherqq
;CHECK: vpadd
;CHECK: vpscatterqq
;CHECK: ret
define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_qpi
;CHECK: vpxor %ymm
;CHECK: kxnorw
;CHECK: vpgatherqd
;CHECK: vpadd
;CHECK: vpscatterqd
;CHECK: ret
define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_mask_dpd_execdomain
;CHECK: vgatherdpd
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@ -239,7 +142,7 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@ -249,7 +152,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
;CHECK: vmovaps
;CHECK: ret
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
%res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
ret <16 x float> %res;
}
@ -258,7 +161,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
;CHECK: vmovaps
;CHECK: ret
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
%res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
%res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
}
@ -268,7 +171,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
;CHECK: ret
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
ret void
}
@ -278,7 +181,7 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
ret void
}
@ -288,7 +191,7 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = load <16 x float>* %src, align 64
call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
ret void
}
@ -298,6 +201,35 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
;CHECK: ret
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x float>* %src, align 32
call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
ret void
}
;CHECK-LABEL: gather_qps
;CHECK: kxnorw
;CHECK: vgatherqps
;CHECK: vpadd
;CHECK: vscatterqps
;CHECK: ret
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
;CHECK-LABEL: prefetch
;CHECK: gatherpf0
;CHECK: gatherpf1
;CHECK: scatterpf0
;CHECK: scatterpf1
;CHECK: ret
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
define void @prefetch(<8 x i64> %ind, i8* %base) {
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}