diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 3badd82fbd2..d50f98c0c59 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1388,6 +1388,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, llvm_v32i8_ty],
                         [IntrNoMem, Commutative]>;
+  def int_x86_avx512_mask_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Vector min, max
@@ -1428,6 +1434,30 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
                         [IntrNoMem, Commutative]>;
+  def int_x86_avx512_mask_pmaxu_d_512 : GCCBuiltin<"__builtin_ia32_pmaxud512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmaxs_d_512 : GCCBuiltin<"__builtin_ia32_pmaxsd512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmaxu_q_512 : GCCBuiltin<"__builtin_ia32_pmaxuq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmaxs_q_512 : GCCBuiltin<"__builtin_ia32_pmaxsq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pminu_d_512 : GCCBuiltin<"__builtin_ia32_pminud512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmins_d_512 : GCCBuiltin<"__builtin_ia32_pminsd512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pminu_q_512 : GCCBuiltin<"__builtin_ia32_pminuq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pmins_q_512 : GCCBuiltin<"__builtin_ia32_pminsq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Integer shift ops.
@@ -1520,6 +1550,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx2_pabs_d : GCCBuiltin<"__builtin_ia32_pabsd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pabs_d_512 : GCCBuiltin<"__builtin_ia32_pabsd512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                         llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pabs_q_512 : GCCBuiltin<"__builtin_ia32_pabsq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_i8_ty], [IntrNoMem]>;
 }

 // Horizontal arithmetic ops
@@ -2822,32 +2858,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
                          [IntrNoMem]>;
-  def int_x86_avx512_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud512">,
-              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
-                         llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmaxu_q : GCCBuiltin<"__builtin_ia32_pmaxuq512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmaxs_d : GCCBuiltin<"__builtin_ia32_pmaxsd512">,
-              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
-                         llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmaxs_q : GCCBuiltin<"__builtin_ia32_pmaxsq512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_v8i64_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_pminu_d : GCCBuiltin<"__builtin_ia32_pminud512">,
-              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
-                         llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pminu_q : GCCBuiltin<"__builtin_ia32_pminuq512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_v8i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd512">,
-              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
-                         llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmins_q : GCCBuiltin<"__builtin_ia32_pminsq512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_v8i64_ty], [IntrNoMem]>;
-
   def int_x86_avx512_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                          llvm_i32_ty], [IntrNoMem]>;
@@ -3088,22 +3098,22 @@ let TargetPrefix = "x86" in {
 // Vector blend
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_blend_ps_512 : GCCBuiltin<"__builtin_ia32_mask_blendps512">,
+  def int_x86_avx512_mask_blend_ps_512 : GCCBuiltin<"__builtin_ia32_blendmps_512_mask">,
         Intrinsic<[llvm_v16f32_ty],
-          [llvm_v16i1_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+          [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
           [IntrNoMem]>;
-  def int_x86_avx512_mask_blend_pd_512 : GCCBuiltin<"__builtin_ia32_mask_blendpd512">,
+  def int_x86_avx512_mask_blend_pd_512 : GCCBuiltin<"__builtin_ia32_blendmpd_512_mask">,
         Intrinsic<[llvm_v8f64_ty],
-          [llvm_v8i1_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+          [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
           [IntrNoMem]>;
-  def int_x86_avx512_mask_blend_d_512 : GCCBuiltin<"__builtin_ia32_mask_blendd512">,
+  def int_x86_avx512_mask_blend_d_512 : GCCBuiltin<"__builtin_ia32_blendmd_512_mask">,
         Intrinsic<[llvm_v16i32_ty],
-          [llvm_v16i1_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+          [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
           [IntrNoMem]>;
-  def int_x86_avx512_mask_blend_q_512 : GCCBuiltin<"__builtin_ia32_mask_blendq512">,
+  def int_x86_avx512_mask_blend_q_512 : GCCBuiltin<"__builtin_ia32_blendmq_512_mask">,
         Intrinsic<[llvm_v8i64_ty],
-          [llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i64_ty],
+          [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
           [IntrNoMem]>;
 }
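
For orientation, a minimal IR sketch of the new masked-intrinsic shape (the
function name is illustrative, not part of the patch). The last two operands
carry the pass-through vector and the scalar bitmask; the all-ones-mask /
zero-pass-through combination below is exactly the form the X86InstrAVX512.td
patterns later in this patch select down to the plain instruction.

; Illustrative only: exercises the new masked signature from this patch.
define <16 x i32> @example_pmaxud(<16 x i32> %a, <16 x i32> %b) {
  %r = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %a,
             <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %r
}
declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
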
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 01e60a4717e..a41faab73b4 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11383,32 +11383,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_avx2_pmaxu_b:
   case Intrinsic::x86_avx2_pmaxu_w:
   case Intrinsic::x86_avx2_pmaxu_d:
-  case Intrinsic::x86_avx512_pmaxu_d:
-  case Intrinsic::x86_avx512_pmaxu_q:
   case Intrinsic::x86_sse2_pminu_b:
   case Intrinsic::x86_sse41_pminuw:
   case Intrinsic::x86_sse41_pminud:
   case Intrinsic::x86_avx2_pminu_b:
   case Intrinsic::x86_avx2_pminu_w:
   case Intrinsic::x86_avx2_pminu_d:
-  case Intrinsic::x86_avx512_pminu_d:
-  case Intrinsic::x86_avx512_pminu_q:
   case Intrinsic::x86_sse41_pmaxsb:
   case Intrinsic::x86_sse2_pmaxs_w:
   case Intrinsic::x86_sse41_pmaxsd:
   case Intrinsic::x86_avx2_pmaxs_b:
   case Intrinsic::x86_avx2_pmaxs_w:
   case Intrinsic::x86_avx2_pmaxs_d:
-  case Intrinsic::x86_avx512_pmaxs_d:
-  case Intrinsic::x86_avx512_pmaxs_q:
   case Intrinsic::x86_sse41_pminsb:
   case Intrinsic::x86_sse2_pmins_w:
   case Intrinsic::x86_sse41_pminsd:
   case Intrinsic::x86_avx2_pmins_b:
   case Intrinsic::x86_avx2_pmins_w:
-  case Intrinsic::x86_avx2_pmins_d:
-  case Intrinsic::x86_avx512_pmins_d:
-  case Intrinsic::x86_avx512_pmins_q: {
+  case Intrinsic::x86_avx2_pmins_d: {
     unsigned Opcode;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -11418,8 +11410,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxu_b:
     case Intrinsic::x86_avx2_pmaxu_w:
     case Intrinsic::x86_avx2_pmaxu_d:
-    case Intrinsic::x86_avx512_pmaxu_d:
-    case Intrinsic::x86_avx512_pmaxu_q:
       Opcode = X86ISD::UMAX;
       break;
     case Intrinsic::x86_sse2_pminu_b:
@@ -11428,8 +11418,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pminu_b:
     case Intrinsic::x86_avx2_pminu_w:
     case Intrinsic::x86_avx2_pminu_d:
-    case Intrinsic::x86_avx512_pminu_d:
-    case Intrinsic::x86_avx512_pminu_q:
       Opcode = X86ISD::UMIN;
       break;
     case Intrinsic::x86_sse41_pmaxsb:
@@ -11438,8 +11426,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxs_b:
     case Intrinsic::x86_avx2_pmaxs_w:
     case Intrinsic::x86_avx2_pmaxs_d:
-    case Intrinsic::x86_avx512_pmaxs_d:
-    case Intrinsic::x86_avx512_pmaxs_q:
       Opcode = X86ISD::SMAX;
       break;
     case Intrinsic::x86_sse41_pminsb:
@@ -11448,8 +11434,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmins_b:
     case Intrinsic::x86_avx2_pmins_w:
     case Intrinsic::x86_avx2_pmins_d:
-    case Intrinsic::x86_avx512_pmins_d:
-    case Intrinsic::x86_avx512_pmins_q:
       Opcode = X86ISD::SMIN;
       break;
     }
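
Note that the 256-bit AVX2 intrinsics keep going through this generic
X86ISD::UMAX/UMIN/SMAX/SMIN lowering; only the 512-bit cases are deleted,
since those are now selected by the TableGen patterns added below. A small IR
sketch of the unchanged AVX2 path (illustrative function name):

; Illustrative only: still lowers via the X86ISD::UMAX case above.
define <8 x i32> @example_avx2_pmaxud(<8 x i32> %a, <8 x i32> %b) {
  %r = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a, <8 x i32> %b)
  ret <8 x i32> %r
}
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
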
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 58d720c80e4..5d91e37f5ef 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -608,68 +608,65 @@ defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem
 //===----------------------------------------------------------------------===//
 // AVX-512 - BLEND using mask
 //
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int,
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr,
                           RegisterClass KRC, RegisterClass RC,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           SDNode OpNode, ValueType vt> {
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
              (ins KRC:$mask, RC:$src1, RC:$src2),
              !strconcat(OpcodeStr,
              "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
              [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
-  let isCodeGenOnly = 1 in
-  def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-             (ins KRC:$mask, RC:$src1, RC:$src2),
-             !strconcat(OpcodeStr,
-             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-             [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2),
-                (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
-
-  let mayLoad = 1 in {
-    def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-             (ins KRC:$mask, RC:$src1, x86memop:$src2),
-             !strconcat(OpcodeStr,
-             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-             []>,
-             EVEX_4V, EVEX_K;
-
-    let isCodeGenOnly = 1 in
-    def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-             (ins KRC:$mask, RC:$src1, x86memop:$src2),
-             !strconcat(OpcodeStr,
-             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-             [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1),
-                (mem_frag addr:$src2)))]>,
-             EVEX_4V, EVEX_K;
-  }
+  let mayLoad = 1 in
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins KRC:$mask, RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_K;
 }

 let ExeDomain = SSEPackedSingle in
 defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps",
-                              int_x86_avx512_mask_blend_ps_512,
                               VK16WM, VR512, f512mem,
                               memopv16f32, vselect, v16f32>,
                               EVEX_CD8<32, CD8VF>, EVEX_V512;

 let ExeDomain = SSEPackedDouble in
 defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd",
-                              int_x86_avx512_mask_blend_pd_512,
                               VK8WM, VR512, f512mem,
                               memopv8f64, vselect, v8f64>,
                               VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;

+def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1),
+                  (v16f32 VR512:$src2), (i16 GR16:$mask))),
+          (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16),
+           VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1),
+                  (v8f64 VR512:$src2), (i8 GR8:$mask))),
+          (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8),
+           VR512:$src1, VR512:$src2)>;
+
 defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd",
-                              int_x86_avx512_mask_blend_d_512,
                               VK16WM, VR512, f512mem,
                               memopv16i32, vselect, v16i32>,
                               EVEX_CD8<32, CD8VF>, EVEX_V512;

 defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq",
-                              int_x86_avx512_mask_blend_q_512,
                               VK8WM, VR512, f512mem,
                               memopv8i64, vselect, v8i64>,
                               VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;

+def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1),
+                  (v16i32 VR512:$src2), (i16 GR16:$mask))),
+          (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16),
+           VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1),
+                  (v8i64 VR512:$src2), (i8 GR8:$mask))),
+          (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8),
+           VR512:$src1, VR512:$src2)>;
+
 let Predicates = [HasAVX512] in {
 def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
                           (v8f32 VR256X:$src2))),
@@ -1780,6 +1777,13 @@ defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
 def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
           (VPMULUDQZrr VR512:$src1, VR512:$src2)>;

+def : Pat<(v8i64 (int_x86_avx512_mask_pmulu_dq_512 (v16i32 VR512:$src1),
+                  (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1),
+                  (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPMULDQZrr VR512:$src1, VR512:$src2)>;
+
 defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32,
                    i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
                    T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
@@ -1789,7 +1793,7 @@ defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i
 defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32,
                    i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
-                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64,
                    i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
                    T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -1808,6 +1812,30 @@ defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i
                    i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
                    T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;

+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMAXSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxu_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMAXUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxs_q_512 (v8i64 VR512:$src1),
+                   (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMAXSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxu_q_512 (v8i64 VR512:$src1),
+                   (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMAXUQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmins_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMINSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pminu_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMINUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmins_q_512 (v8i64 VR512:$src1),
+                   (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMINSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pminu_q_512 (v8i64 VR512:$src1),
+                   (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMINUQZrr VR512:$src1, VR512:$src2)>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Unpack Instructions
 //===----------------------------------------------------------------------===//
@@ -3773,6 +3801,13 @@ defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512,
                            EVEX_CD8<32, CD8VF>;
 defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W,
                            EVEX_CD8<64, CD8VF>;

+def : Pat<(v16i32 (int_x86_avx512_mask_pabs_d_512 (v16i32 VR512:$src),
+                   (v16i32 immAllZerosV), (i16 -1))),
+          (VPABSDrr VR512:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src),
+                  (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPABSQrr VR512:$src)>;
+
 multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
                          RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop,
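
A minimal sketch of the new blend lowering (illustrative function name): the
scalar mask arrives in a GPR, and the pattern above copies it into a VK16 mask
register with COPY_TO_REGCLASS so the EVEX-masked vblendmps can consume it.

; Illustrative only.
define <16 x float> @example_blendmps(i16 %mask, <16 x float> %a, <16 x float> %b) {
  %r = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a,
             <16 x float> %b, i16 %mask)
  ret <16 x float> %r
}
declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16)
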
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 2e136ad8057..ed2bf9cf6a7 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1420,6 +1420,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VPERMI2Qrr,        X86::VPERMI2Qrm,          0 },
     { X86::VPERMI2PSrr,       X86::VPERMI2PSrm,         0 },
     { X86::VPERMI2PDrr,       X86::VPERMI2PDrm,         0 },
+    { X86::VBLENDMPDZrr,      X86::VBLENDMPDZrm,        0 },
+    { X86::VBLENDMPSZrr,      X86::VBLENDMPSZrm,        0 },
+    { X86::VPBLENDMDZrr,      X86::VPBLENDMDZrm,        0 },
+    { X86::VPBLENDMQZrr,      X86::VPBLENDMQZrm,        0 }
   };

   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
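
The OpTbl3 entries register rr-to-rm folding for the new blend forms: in a
sketch like the following (illustrative function name), the load should fold
into vblendmpd's memory operand instead of requiring a separate vector load,
which is what the memop test below checks for.

; Illustrative only: the load can fold into the blend's memory operand.
define <8 x double> @example_blendmpd_fold(<8 x double> %a, <8 x double>* %p, i8 %mask) {
  %b = load <8 x double>* %p
  %r = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a,
             <8 x double> %b, i8 %mask)
  ret <8 x double> %r
}
declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8)
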
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 12c3ba25f5c..6c17346a1e5 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -290,62 +290,6 @@ define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
 }
 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly

-define <16 x i32> @test_x86_pmaxu_d(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpmaxud
-  %res = call <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
-  ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32>, <16 x i32>) nounwind readonly
-
-define <8 x i64> @test_x86_pmaxu_q(<8 x i64> %a0, <8 x i64> %a1) {
-  ; CHECK: vpmaxuq
-  %res = call <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64>, <8 x i64>) nounwind readonly
-
-define <16 x i32> @test_x86_pmaxs_d(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpmaxsd
-  %res = call <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
-  ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32>, <16 x i32>) nounwind readonly
-
-define <8 x i64> @test_x86_pmaxs_q(<8 x i64> %a0, <8 x i64> %a1) {
-  ; CHECK: vpmaxsq
-  %res = call <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64>, <8 x i64>) nounwind readonly
-
-define <16 x i32> @test_x86_pminu_d(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpminud
-  %res = call <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
-  ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32>, <16 x i32>) nounwind readonly
-
-define <8 x i64> @test_x86_pminu_q(<8 x i64> %a0, <8 x i64> %a1) {
-  ; CHECK: vpminuq
-  %res = call <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64>, <8 x i64>) nounwind readonly
-
-define <16 x i32> @test_x86_pmins_d(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpminsd
-  %res = call <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
-  ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32>, <16 x i32>) nounwind readonly
-
-define <8 x i64> @test_x86_pmins_q(<8 x i64> %a0, <8 x i64> %a1) {
-  ; CHECK: vpminsq
-  %res = call <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64>, <8 x i64>) nounwind readonly
-
 define <16 x i32> @test_conflict_d(<16 x i32> %a) {
 ; CHECK: movw $-1, %ax
 ; CHECK: vpxor
@@ -381,45 +325,40 @@ define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {

 define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK: vblendmps
-  %m0 = bitcast i16 %a0 to <16 x i1>
-  %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x i1> %m0, <16 x float> %a1, <16 x float> %a2) ; <<16 x float>> [#uses=1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x i1> %a0, <16 x float> %a1, <16 x float> %a2) nounwind readonly
+declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly

 define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK: vblendmpd
-  %m0 = bitcast i8 %a0 to <8 x i1>
-  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x i1> %m0, <8 x double> %a1, <8 x double> %a2) ; <<8 x double>> [#uses=1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }

 define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
   ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop
-  ; CHECK: vblendmpd {{.*}}, {{%zmm[0-9]}}, {{%zmm[0-9]}} {%k1}
-  %vmask = bitcast i8 %mask to <8 x i1>
+  ; CHECK: vblendmpd (%
   %b = load <8 x double>* %ptr
-  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x i1> %vmask, <8 x double> %a, <8 x double> %b) ; <<8 x double>> [#uses=1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x i1> %a0, <8 x double> %a1, <8 x double> %a2) nounwind readonly
+declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly

 define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
   ; CHECK: vpblendmd
-  %m0 = bitcast i16 %a0 to <16 x i1>
-  %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i1> %m0, <16 x i32> %a1, <16 x i32> %a2) ; <<16 x i32>> [#uses=1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
   ret <16 x i32> %res
 }
-declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i1> %a0, <16 x i32> %a1, <16 x i32> %a2) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

 define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
   ; CHECK: vpblendmq
-  %m0 = bitcast i8 %a0 to <8 x i1>
-  %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i1> %m0, <8 x i64> %a1, <8 x i64> %a2) ; <<8 x i64>> [#uses=1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
   ret <8 x i64> %res
 }
-declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i1> %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly

 define <8 x i32> @test_cvtpd2udq(<8 x double> %a) {
 ;CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0]
@@ -521,3 +460,49 @@ declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>
   ret <8 x float>%res
 }
 declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
+
+ define <16 x i32> @test_pabsd(<16 x i32> %a) {
+ ;CHECK: vpabsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+ }
+ declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
+
+ define <8 x i64> @test_pabsq(<8 x i64> %a) {
+ ;CHECK: vpabsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+ }
+ declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK: vpmaxsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
+                    <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpminud {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
+                    <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpmaxsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
+                    <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_vpmuludq(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpmuludq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a0, <16 x i32> %a1,
+                    <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)