[AVX512] Added intrinsics for 128- and 256-bit versions of VCMPEQ{BWDQ}

Fixed lowering of this intrinsics in case when mask is v2i1 and v4i1.
Now cmp intrinsics lower in the following way:
 (i8 (int_x86_avx512_mask_pcmpeq_q_128
             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
 (i8 (bitcast
   (v8i1 (insert_subvector undef,
           (v2i1 (and (PCMPEQM %a, %b),
                      (extract_subvector
                         (v8i1 (bitcast %mask)), 0))), 0))))


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218669 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Robert Khasanov 2014-09-30 11:41:54 +00:00
parent cfa5724d50
commit 175ff01f0f
6 changed files with 214 additions and 7 deletions

View File

@ -3249,6 +3249,34 @@ let TargetPrefix = "x86" in {
def int_x86_avx512_mask_pcmpeq_q_512 : GCCBuiltin<"__builtin_ia32_pcmpeqq512_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
[IntrNoMem]>;
// 256-bit
def int_x86_avx512_mask_pcmpeq_b_256 : GCCBuiltin<"__builtin_ia32_pcmpeqb256_mask">,
Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_w_256 : GCCBuiltin<"__builtin_ia32_pcmpeqw256_mask">,
Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_d_256 : GCCBuiltin<"__builtin_ia32_pcmpeqd256_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_q_256 : GCCBuiltin<"__builtin_ia32_pcmpeqq256_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
[IntrNoMem]>;
// 128-bit
def int_x86_avx512_mask_pcmpeq_b_128 : GCCBuiltin<"__builtin_ia32_pcmpeqb128_mask">,
Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_w_128 : GCCBuiltin<"__builtin_ia32_pcmpeqw128_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_d_128 : GCCBuiltin<"__builtin_ia32_pcmpeqd128_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pcmpeq_q_128 : GCCBuiltin<"__builtin_ia32_pcmpeqq128_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
[IntrNoMem]>;
}
// Misc.

View File

@ -1560,6 +1560,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
}
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
@ -15867,6 +15868,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
EVT VT = Op.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
MVT::i1, VT.getVectorNumElements());
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDLoc dl(Op);
assert(MaskVT.isSimple() && "invalid mask type");
@ -15874,19 +15877,22 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
if (isAllOnes(Mask))
return Op;
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
DAG.getIntPtrConstant(0));
switch (Op.getOpcode()) {
default: break;
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
case X86ISD::CMPMU:
return DAG.getNode(ISD::AND, dl, VT, Op,
DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask));
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
}
return DAG.getNode(ISD::VSELECT, dl, VT,
DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask),
Op, PreservedSrc);
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
@ -15953,13 +15959,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case CMP_MASK: {
// Comparison intrinsics with masks.
// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
// (v8i1 (insert_subvector undef,
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
EVT VT = Op.getOperand(1).getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
SDValue Mask = Op.getOperand(3);
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT,
Op.getOperand(1), Op.getOperand(2));
SDValue Res = getVectorMaskingNode(Cmp, Op.getOperand(3),
DAG.getTargetConstant(0, MaskVT), DAG);
SDValue CmpMask = getVectorMaskingNode(Cmp, Op.getOperand(3),
DAG.getTargetConstant(0, MaskVT), DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
case COMI: { // Comparison intrinsics

View File

@ -1768,6 +1768,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
let Predicates = [HasVLX] in {
def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
(v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
(v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
(v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
(v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
}
def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;

View File

@ -156,9 +156,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),

View File

@ -0,0 +1,70 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
ret i32 %res
}
define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
ret i32 %res
}
declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
ret i16 %res
}
declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
; 128-bit
define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
}
declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)

View File

@ -0,0 +1,69 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
; 128-bit
define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)