From 717d41d8c3b021066bcbeb460aa962322bc1752d Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 25 Jan 2015 12:47:15 +0000 Subject: [PATCH] AVX-512: Changes in operations on masks registers for KNL and SKX - Added KSHIFTB/D/Q for skx - Added KORTESTB/D/Q for skx - Fixed store operation for v8i1 type for KNL - Store size of v8i1, v4i1 and v2i1 are changed to 8 bits git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227043 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 4 ++ lib/Target/X86/X86InstrAVX512.td | 47 +++++++++++-- lib/Target/X86/X86RegisterInfo.td | 16 ++--- test/CodeGen/X86/avx512-insert-extract.ll | 21 ++++-- test/CodeGen/X86/avx512-mask-op.ll | 86 ++++++++++++++--------- 5 files changed, 121 insertions(+), 53 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 684f23b464e..8b82ae9a42d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12871,6 +12871,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512 @@ -12884,6 +12886,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const unsigned IdxVal = cast(Idx)->getZExtValue(); const TargetRegisterClass* rc = getRegClassFor(VecVT); + if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) + rc = getRegClassFor(MVT::v16i1); unsigned MaxSift = rc->getSize()*8 - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, MVT::i8)); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index a6fcd2656ae..89cc08b2fc3 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1624,7 +1624,8 @@ multiclass avx512_mask_mov opc_kk, bits<8> opc_km, bits<8> opc_mk, [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>; let mayStore = 1 in def mk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>; } } @@ -1951,13 +1952,26 @@ multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, multiclass avx512_mask_testop_w opc, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_testop, VEX, PS; + let Predicates = [HasDQI] in + defm B : avx512_mask_testop, + VEX, PD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_testop, + VEX, PS, VEX_W; + defm D : avx512_mask_testop, + VEX, PD, VEX_W; + } } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; def : Pat<(X86cmp VK1:$src1, (i1 0)), (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src1, VK16))>; + (COPY_TO_REGCLASS VK1:$src1, VK16))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(X86cmp VK1:$src1, (i1 0)), + (KORTESTBrr (COPY_TO_REGCLASS VK1:$src1, VK8), + (COPY_TO_REGCLASS VK1:$src1, VK8))>, Requires<[HasDQI]>; // Mask shift multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, @@ -1972,7 +1986,17 @@ multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_shiftop, - VEX, TAPD, VEX_W; + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop, + VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm D : avx512_mask_shiftop, + VEX, TAPD; + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -2023,10 +2047,14 @@ let Predicates = [HasVLX] in { } def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // @@ -5204,7 +5232,14 @@ def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; def : Pat<(store VK1:$src, addr:$dst), - (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>; + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(store VK8:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), (truncstore node:$val, node:$ptr), [{ diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 30cd09a5a2f..45a676a0d03 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -469,18 +469,18 @@ def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} -def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} -def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} -def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} +def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;} +def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;} +def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} -def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} -def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} -def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;} +def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;} +def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;} +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;} def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index eba895ebf56..3d263df03dc 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -106,7 +106,8 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { ;CHECK: vpcmpltud ;CHECK: kshiftlw $11 ;CHECK: kshiftrw $15 -;CHECK: kortestw +;KNL: kortestw +;SKX: kortestb ;CHECK: je ;CHECK: ret ;CHECK: ret @@ -125,7 +126,8 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ;CHECK: vpcmpgtq ;CHECK: kshiftlw $15 ;CHECK: kshiftrw $15 -;CHECK: kortestw +;KNL: kortestw +;SKX: kortestb ;CHECK: ret define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { @@ -150,9 +152,12 @@ define i16 @test13(i32 %a, i32 %b) { ;CHECK-LABEL: test14 ;CHECK: vpcmpgtq -;CHECK: kshiftlw $11 -;CHECK: kshiftrw $15 -;CHECK: kortestw +;KNL: kshiftlw $11 +;KNL: kshiftrw $15 +;KNL: kortestw +;SKX: kshiftlb $3 +;SKX: kshiftrb $7 +;SKX: kortestb ;CHECK: ret define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { @@ -188,9 +193,11 @@ define i16 @test16(i1 *%addr, i16 %a) { } ;CHECK-LABEL: test17 -;CHECK: kshiftlw -;CHECK: kshiftrw +;KNL: kshiftlw +;KNL: kshiftrw ;KNL: korw +;SKX: kshiftlb +;SKX: kshiftrb ;SKX: korb ;CHECK: ret define i8 @test17(i1 *%addr, i8 %a) { diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 35d334813fa..264d91503ac 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1,29 +1,38 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL --check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX --check-prefix=CHECK +; CHECK-LABEL: mask16 +; CHECK: kmovw +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw define i16 @mask16(i16 %x) { %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %ret = bitcast <16 x i1> %m1 to i16 ret i16 %ret -; CHECK-LABEL: mask16 -; CHECK: kmovw -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw -; CHECK: ret } +; CHECK-LABEL: mask8 +; KNL: kmovw +; KNL-NEXT: knotw +; KNL-NEXT: kmovw +; SKX: kmovb +; SKX-NEXT: knotb +; SKX-NEXT: kmovb + define i8 @mask8(i8 %x) { %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %ret = bitcast <8 x i1> %m1 to i8 ret i8 %ret -; CHECK-LABEL: mask8 -; CHECK: kmovw -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw -; CHECK: ret } +; CHECK-LABEL: mask16_mem +; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) +; CHECK: ret + define void @mask16_mem(i16* %ptr) { %x = load i16* %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> @@ -31,13 +40,16 @@ define void @mask16_mem(i16* %ptr) { %ret = bitcast <16 x i1> %m1 to i16 store i16 %ret, i16* %ptr, align 4 ret void -; CHECK-LABEL: mask16_mem -; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) -; CHECK: ret } +; CHECK-LABEL: mask8_mem +; KNL: kmovw ([[ARG1]]), %k{{[0-7]}} +; KNL-NEXT: knotw +; KNL-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) +; SKX: kmovb ([[ARG1]]), %k{{[0-7]}} +; SKX-NEXT: knotb +; SKX-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]]) + define void @mask8_mem(i8* %ptr) { %x = load i8* %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> @@ -45,13 +57,12 @@ define void @mask8_mem(i8* %ptr) { %ret = bitcast <8 x i1> %m1 to i8 store i8 %ret, i8* %ptr, align 4 ret void -; CHECK-LABEL: mask8_mem -; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}} -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) -; CHECK: ret } +; CHECK-LABEL: mand16 +; CHECK: kandw +; CHECK: kxorw +; CHECK: korw define i16 @mand16(i16 %x, i16 %y) { %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> @@ -59,15 +70,11 @@ define i16 @mand16(i16 %x, i16 %y) { %md = xor <16 x i1> %ma, %mb %me = or <16 x i1> %mc, %md %ret = bitcast <16 x i1> %me to i16 -; CHECK: kandw -; CHECK: kxorw -; CHECK: korw ret i16 %ret } -; CHECK: shuf_test1 +; CHECK-LABEL: shuf_test1 ; CHECK: kshiftrw $8 -; CHECK:ret define i8 @shuf_test1(i16 %v) nounwind { %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> @@ -75,11 +82,11 @@ define i8 @shuf_test1(i16 %v) nounwind { ret i8 %mask1 } -; CHECK: zext_test1 +; CHECK-LABEL: zext_test1 ; CHECK: kshiftlw ; CHECK: kshiftrw ; CHECK: kmovw -; CHECK:ret + define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -87,11 +94,11 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ret i32 %res } -; CHECK: zext_test2 +; CHECK-LABEL: zext_test2 ; CHECK: kshiftlw ; CHECK: kshiftrw ; CHECK: kmovw -; CHECK:ret + define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -99,14 +106,29 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ret i16 %res } -; CHECK: zext_test3 +; CHECK-LABEL: zext_test3 ; CHECK: kshiftlw ; CHECK: kshiftrw ; CHECK: kmovw -; CHECK:ret + define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 ret i8 %res } + +; CHECK-LABEL: conv1 +; KNL: kmovw %k0, %eax +; KNL: movb %al, (%rdi) +; SKX: kmovb %k0, (%rdi) +define i8 @conv1(<8 x i1>* %R) { +entry: + store <8 x i1> , <8 x i1>* %R + + %maskPtr = alloca <8 x i1> + store <8 x i1> , <8 x i1>* %maskPtr + %mask = load <8 x i1>* %maskPtr + %mask_convert = bitcast <8 x i1> %mask to i8 + ret i8 %mask_convert +} \ No newline at end of file