AVX-512: Changes in operations on mask registers for KNL and SKX

- Added KSHIFTB/D/Q for SKX
- Added KORTESTB/D/Q for SKX
- Fixed the store operation for the v8i1 type on KNL
- Changed the store size of v8i1, v4i1 and v2i1 to 8 bits (see the IR sketch below)
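To illustrate the last two bullets, here is a minimal IR sketch, not taken from the commit itself (the function name and the instruction sequences in the comments are assumptions, based on the conv1 test added below): a plain <8 x i1> store now writes a single byte on both targets.

define void @store_mask8_sketch(<8 x i1> %m, <8 x i1>* %p) {
  ; KNL (no DQI): expected to move the mask to a GPR (kmovw) and store one byte (movb)
  ; SKX (DQI):    expected to store the mask directly with kmovb
  store <8 x i1> %m, <8 x i1>* %p
  ret void
}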



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227043 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 2015-01-25 12:47:15 +00:00
parent c4fbd5d26b
commit 717d41d8c3
5 changed files with 121 additions and 53 deletions


@@ -12871,6 +12871,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512
@@ -12884,6 +12886,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
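The hunk above only changes which register class the extracted mask is shifted in: without DQI the v8i1 value is first copied to VK16, so MaxSift is 15 and the word-sized shifts are used; with DQI it stays in VK8 and MaxSift is 7. A hedged IR sketch of an input that reaches this path (the function name and the shift immediates in the comments are assumptions, derived from MaxSift - IdxVal):

define i1 @extract_mask_bit_sketch(<8 x i64> %a, <8 x i64> %b) {
  ; KNL: expected kshiftlw $10 / kshiftrw $15 to isolate bit 5 in a 16-bit mask register
  ; SKX: expected kshiftlb $2 / kshiftrb $7, staying in an 8-bit mask register
  %cmp = icmp sgt <8 x i64> %a, %b
  %bit = extractelement <8 x i1> %cmp, i32 5
  ret i1 %bit
}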


@@ -1624,7 +1624,8 @@ multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
[(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
let mayStore = 1 in
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(store KRC:$src, addr:$dst)]>;
}
}
@@ -1951,13 +1952,26 @@ multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
VEX, PS;
let Predicates = [HasDQI] in
defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
VEX, PD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
VEX, PS, VEX_W;
defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
VEX, PD, VEX_W;
}
}
defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
def : Pat<(X86cmp VK1:$src1, (i1 0)),
(KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
(COPY_TO_REGCLASS VK1:$src1, VK16))>;
(COPY_TO_REGCLASS VK1:$src1, VK16))>, Requires<[HasAVX512, NoDQI]>;
def : Pat<(X86cmp VK1:$src1, (i1 0)),
(KORTESTBrr (COPY_TO_REGCLASS VK1:$src1, VK8),
(COPY_TO_REGCLASS VK1:$src1, VK8))>, Requires<[HasDQI]>;
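These two patterns pick the kortest form used when a single mask bit (VK1) is compared against zero: without DQI both operands are widened to VK16 and KORTESTW is emitted; with DQI the bit is copied to VK8 and KORTESTB is used instead, matching the new KNL/SKX checks in test10 and test11. A minimal IR sketch of code that ends in this pattern (function and label names are assumptions):

define i32 @branch_on_mask_bit_sketch(<16 x i32> %x, <16 x i32> %y) {
  ; both targets isolate the bit with kshiftlw/kshiftrw; the compare against zero
  ; is expected to become kortestw on KNL and kortestb on SKX, followed by je
entry:
  %m = icmp ult <16 x i32> %x, %y
  %b = extractelement <16 x i1> %m, i32 0
  br i1 %b, label %taken, label %not_taken
taken:
  ret i32 1
not_taken:
  ret i32 0
}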
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -1972,7 +1986,17 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
SDNode OpNode> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
VEX, TAPD, VEX_W;
VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
VEX, TAPD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
VEX, TAPD;
}
}
defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
@@ -2023,10 +2047,14 @@ let Predicates = [HasVLX] in {
}
def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
(v8i1 (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16),
(I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
(v8i1 (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16),
(I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
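Without DQI a v8i1 shift still has to be carried out in a 16-bit mask register, hence the COPY_TO_REGCLASS to VK16 around KSHIFTLWri/KSHIFTRWri above. One place such shifts surface is insertion of a single bit into an 8-bit mask; a hedged IR sketch (the function is illustrative, and the expected instructions mirror the updated checks in test17):

define i8 @insert_mask_bit_sketch(i8 %a, i1 %b) {
  ; KNL: expected kshiftlw/kshiftrw plus korw, working in a 16-bit mask register
  ; SKX: expected kshiftlb/kshiftrb plus korb, staying in an 8-bit mask register
  %m  = bitcast i8 %a to <8 x i1>
  %m2 = insertelement <8 x i1> %m, i1 %b, i32 3
  %r  = bitcast <8 x i1> %m2 to i8
  ret i8 %r
}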
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
@@ -5204,7 +5232,14 @@ def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
def : Pat<(store VK1:$src, addr:$dst),
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>;
(MOV8mr addr:$dst,
(EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
def : Pat<(store VK8:$src, addr:$dst),
(MOV8mr addr:$dst,
(EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
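With these patterns a VK1 or VK8 store on targets without DQI no longer goes through KMOVWmk (which wrote 16 bits); the mask is moved to a GPR with KMOVWrk and only the low byte is written with MOV8mr. The conv1 test added below covers the v8i1 case; a reduced sketch of the scalar i1 case (whether a given i1 actually lives in VK1 depends on isel, so this is an assumption rather than a check taken from the commit):

define void @store_cmp_bit_sketch(<16 x i32> %a, <16 x i32> %b, i1* %p) {
  ; the extracted bit lives in a mask register; on KNL it is expected to be moved
  ; out with kmovw and stored as one byte (movb), not as a 16-bit kmovw store
  %m = icmp ugt <16 x i32> %a, %b
  %bit = extractelement <16 x i1> %m, i32 5
  store i1 %bit, i1* %p
  ret void
}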
def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr), [{


@@ -469,18 +469,18 @@ def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}


@@ -106,7 +106,8 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
;CHECK: vpcmpltud
;CHECK: kshiftlw $11
;CHECK: kshiftrw $15
;CHECK: kortestw
;KNL: kortestw
;SKX: kortestb
;CHECK: je
;CHECK: ret
;CHECK: ret
@@ -125,7 +126,8 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
;CHECK: vpcmpgtq
;CHECK: kshiftlw $15
;CHECK: kshiftrw $15
;CHECK: kortestw
;KNL: kortestw
;SKX: kortestb
;CHECK: ret
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
@@ -150,9 +152,12 @@ define i16 @test13(i32 %a, i32 %b) {
;CHECK-LABEL: test14
;CHECK: vpcmpgtq
;CHECK: kshiftlw $11
;CHECK: kshiftrw $15
;CHECK: kortestw
;KNL: kshiftlw $11
;KNL: kshiftrw $15
;KNL: kortestw
;SKX: kshiftlb $3
;SKX: kshiftrb $7
;SKX: kortestb
;CHECK: ret
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
@@ -188,9 +193,11 @@ define i16 @test16(i1 *%addr, i16 %a) {
}
;CHECK-LABEL: test17
;CHECK: kshiftlw
;CHECK: kshiftrw
;KNL: kshiftlw
;KNL: kshiftrw
;KNL: korw
;SKX: kshiftlb
;SKX: kshiftrb
;SKX: korb
;CHECK: ret
define i8 @test17(i1 *%addr, i8 %a) {


@@ -1,29 +1,38 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL --check-prefix=CHECK
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX --check-prefix=CHECK
; CHECK-LABEL: mask16
; CHECK: kmovw
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw
define i16 @mask16(i16 %x) {
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
ret i16 %ret
; CHECK-LABEL: mask16
; CHECK: kmovw
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw
; CHECK: ret
}
; CHECK-LABEL: mask8
; KNL: kmovw
; KNL-NEXT: knotw
; KNL-NEXT: kmovw
; SKX: kmovb
; SKX-NEXT: knotb
; SKX-NEXT: kmovb
define i8 @mask8(i8 %x) {
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
; CHECK-LABEL: mask8
; CHECK: kmovw
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw
; CHECK: ret
}
; CHECK-LABEL: mask16_mem
; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
; CHECK: ret
define void @mask16_mem(i16* %ptr) {
%x = load i16* %ptr, align 4
%m0 = bitcast i16 %x to <16 x i1>
@@ -31,13 +40,16 @@ define void @mask16_mem(i16* %ptr) {
%ret = bitcast <16 x i1> %m1 to i16
store i16 %ret, i16* %ptr, align 4
ret void
; CHECK-LABEL: mask16_mem
; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
; CHECK: ret
}
; CHECK-LABEL: mask8_mem
; KNL: kmovw ([[ARG1]]), %k{{[0-7]}}
; KNL-NEXT: knotw
; KNL-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
; SKX: kmovb ([[ARG1]]), %k{{[0-7]}}
; SKX-NEXT: knotb
; SKX-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
define void @mask8_mem(i8* %ptr) {
%x = load i8* %ptr, align 4
%m0 = bitcast i8 %x to <8 x i1>
@@ -45,13 +57,12 @@ define void @mask8_mem(i8* %ptr) {
%ret = bitcast <8 x i1> %m1 to i8
store i8 %ret, i8* %ptr, align 4
ret void
; CHECK-LABEL: mask8_mem
; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}}
; CHECK-NEXT: knotw
; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
; CHECK: ret
}
; CHECK-LABEL: mand16
; CHECK: kandw
; CHECK: kxorw
; CHECK: korw
define i16 @mand16(i16 %x, i16 %y) {
%ma = bitcast i16 %x to <16 x i1>
%mb = bitcast i16 %y to <16 x i1>
@@ -59,15 +70,11 @@ define i16 @mand16(i16 %x, i16 %y) {
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
%ret = bitcast <16 x i1> %me to i16
; CHECK: kandw
; CHECK: kxorw
; CHECK: korw
ret i16 %ret
}
; CHECK: shuf_test1
; CHECK-LABEL: shuf_test1
; CHECK: kshiftrw $8
; CHECK:ret
define i8 @shuf_test1(i16 %v) nounwind {
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -75,11 +82,11 @@ define i8 @shuf_test1(i16 %v) nounwind {
ret i8 %mask1
}
; CHECK: zext_test1
; CHECK-LABEL: zext_test1
; CHECK: kshiftlw
; CHECK: kshiftrw
; CHECK: kmovw
; CHECK:ret
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -87,11 +94,11 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
ret i32 %res
}
; CHECK: zext_test2
; CHECK-LABEL: zext_test2
; CHECK: kshiftlw
; CHECK: kshiftrw
; CHECK: kmovw
; CHECK:ret
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -99,14 +106,29 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
ret i16 %res
}
; CHECK: zext_test3
; CHECK-LABEL: zext_test3
; CHECK: kshiftlw
; CHECK: kshiftrw
; CHECK: kmovw
; CHECK:ret
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
ret i8 %res
}
; CHECK-LABEL: conv1
; KNL: kmovw %k0, %eax
; KNL: movb %al, (%rdi)
; SKX: kmovb %k0, (%rdi)
define i8 @conv1(<8 x i1>* %R) {
entry:
store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
%maskPtr = alloca <8 x i1>
store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
%mask = load <8 x i1>* %maskPtr
%mask_convert = bitcast <8 x i1> %mask to i8
ret i8 %mask_convert
}