From 175ff01f0f3f6364c4ad7dce7071dfa513a495ec Mon Sep 17 00:00:00 2001 From: Robert Khasanov Date: Tue, 30 Sep 2014 11:41:54 +0000 Subject: [PATCH] [AVX512] Added intrinsics for 128- and 256-bit versions of VCMPEQ{BWDQ} Fixed lowering of this intrinsics in case when mask is v2i1 and v4i1. Now cmp intrinsics lower in the following way: (i8 (int_x86_avx512_mask_pcmpeq_q_128 (v2i64 %a), (v2i64 %b), (i8 %mask))) -> (i8 (bitcast (v8i1 (insert_subvector undef, (v2i1 (and (PCMPEQM %a, %b), (extract_subvector (v8i1 (bitcast %mask)), 0))), 0)))) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218669 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 28 +++++++++ lib/Target/X86/X86ISelLowering.cpp | 35 +++++++++--- lib/Target/X86/X86InstrAVX512.td | 11 ++++ lib/Target/X86/X86IntrinsicsInfo.h | 8 +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll | 70 +++++++++++++++++++++++ test/CodeGen/X86/avx512vl-intrinsics.ll | 69 ++++++++++++++++++++++ 6 files changed, 214 insertions(+), 7 deletions(-) create mode 100644 test/CodeGen/X86/avx512bwvl-intrinsics.ll create mode 100644 test/CodeGen/X86/avx512vl-intrinsics.ll diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 3012cdf7db4..1e366d17cd7 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3249,6 +3249,34 @@ let TargetPrefix = "x86" in { def int_x86_avx512_mask_pcmpeq_q_512 : GCCBuiltin<"__builtin_ia32_pcmpeqq512_mask">, Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + + // 256-bit + def int_x86_avx512_mask_pcmpeq_b_256 : GCCBuiltin<"__builtin_ia32_pcmpeqb256_mask">, + Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_w_256 : GCCBuiltin<"__builtin_ia32_pcmpeqw256_mask">, + Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_d_256 : GCCBuiltin<"__builtin_ia32_pcmpeqd256_mask">, + Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_q_256 : GCCBuiltin<"__builtin_ia32_pcmpeqq256_mask">, + Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrNoMem]>; + + // 128-bit + def int_x86_avx512_mask_pcmpeq_b_128 : GCCBuiltin<"__builtin_ia32_pcmpeqb128_mask">, + Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_w_128 : GCCBuiltin<"__builtin_ia32_pcmpeqw128_mask">, + Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_d_128 : GCCBuiltin<"__builtin_ia32_pcmpeqd128_mask">, + Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pcmpeq_q_128 : GCCBuiltin<"__builtin_ia32_pcmpeqq128_mask">, + Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrNoMem]>; } // Misc. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 98616119fa0..e50ca6d08f1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1560,6 +1560,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); } // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion @@ -15867,6 +15868,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); @@ -15874,19 +15877,22 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, if (isAllOnes(Mask)) return Op; + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + switch (Op.getOpcode()) { default: break; case X86ISD::PCMPEQM: case X86ISD::PCMPGTM: case X86ISD::CMPM: case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, - DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask)); + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); } - return DAG.getNode(ISD::VSELECT, dl, VT, - DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask), - Op, PreservedSrc); + return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); } static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { @@ -15953,13 +15959,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case CMP_MASK: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) EVT VT = Op.getOperand(1).getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); + SDValue Mask = Op.getOperand(3); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); - SDValue Res = getVectorMaskingNode(Cmp, Op.getOperand(3), - DAG.getTargetConstant(0, MaskVT), DAG); + SDValue CmpMask = getVectorMaskingNode(Cmp, Op.getOperand(3), + DAG.getTargetConstant(0, MaskVT), DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } case COMI: { // Comparison intrinsics diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index c2a95849b6a..7383ae09145 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1768,6 +1768,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +let Predicates = [HasVLX] in { + def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; +} + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 4c374af1f04..35d6e89ce90 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -156,9 +156,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll new file mode 100644 index 00000000000..06cc91452a7 --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32) + +define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16) + +; 128-bit + +define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16) + +define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8) + diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll new file mode 100644 index 00000000000..ada13dea249 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8) + +define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8) + +; 128-bit + +define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8) + +define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)