From 0d5d656524e11532c3f8fbd3c0deae9236676bf7 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 9 Apr 2014 12:37:50 +0000 Subject: [PATCH] AVX-512: insert element to mask vector; store i1 data Implemented INSERT_VECTOR_ELT operation for v16i1 and v8i1 vectors; Implemented "store" for i1 type git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205850 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 60 ++++++++++++++++++++++- lib/Target/X86/X86ISelLowering.h | 2 + lib/Target/X86/X86InstrAVX512.td | 16 ++++++ test/CodeGen/X86/avx512-insert-extract.ll | 38 ++++++++++++++ test/CodeGen/X86/avx512-mov.ll | 9 ++++ 5 files changed, 124 insertions(+), 1 deletion(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5426656f256..bc2c4e9f66e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1394,6 +1394,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); @@ -5811,6 +5813,8 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; + unsigned NumNonConsts = 0; + unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) @@ -5818,9 +5822,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa(In)) { AllContants = false; NonConstIdx = idx; + NumNonConsts++; } - else if (cast(In)->getZExtValue()) + else { + NumConsts++; + if (cast(In)->getZExtValue()) Immediate |= (1ULL << idx); + } if (In != Op.getOperand(0)) IsSplat = false; } @@ -5832,6 +5840,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0)); } + if (NumNonConsts == 1 && NonConstIdx != 0) { + SDValue DstVec; + if (NumConsts) { + SDValue VecAsImm = DAG.getConstant(Immediate, + MVT::getIntegerVT(VT.getSizeInBits())); + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); + } + else + DstVec = DAG.getUNDEF(VT); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(NonConstIdx), + DAG.getIntPtrConstant(NonConstIdx)); + } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; @@ -7948,10 +7969,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, MVT::i8)); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + unsigned MaxSift = rc->getSize()*8 - 1; + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 3d4cf342a8e..c430f7e2ae8 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -874,6 +874,8 @@ namespace llvm { SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 1bc58fe6630..34d517fb72e 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -984,6 +984,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; + def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; + def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; } // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { @@ -1356,6 +1360,14 @@ defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load, "vmovdqu64", SSEPackedInt, v8i64>, XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, + (v16i32 immAllZerosV), GR16:$mask)), + (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)))), @@ -4211,3 +4223,7 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, GR8:$mask), (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 6557ac34935..b360c716b00 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -158,3 +158,41 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1 ret i64 %res } + +;CHECK-LABEL: test15 +;CHECK: kshiftlw +;CHECK: kmovw +;CHECK: ret +define i16 @test15(i1 *%addr) { + %x = load i1 * %addr, align 128 + %x1 = insertelement <16 x i1> undef, i1 %x, i32 10 + %x2 = bitcast <16 x i1>%x1 to i16 + ret i16 %x2 +} + +;CHECK-LABEL: test16 +;CHECK: kshiftlw +;CHECK: kshiftrw +;CHECK: korw +;CHECK: ret +define i16 @test16(i1 *%addr, i16 %a) { + %x = load i1 * %addr, align 128 + %a1 = bitcast i16 %a to <16 x i1> + %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10 + %x2 = bitcast <16 x i1>%x1 to i16 + ret i16 %x2 +} + +;CHECK-LABEL: test17 +;CHECK: kshiftlw +;CHECK: kshiftrw +;CHECK: korw +;CHECK: ret +define i8 @test17(i1 *%addr, i8 %a) { + %x = load i1 * %addr, align 128 + %a1 = bitcast i8 %a to <8 x i1> + %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10 + %x2 = bitcast <8 x i1>%x1 to i8 + ret i8 %x2 +} + diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll index 13e68432447..a769eb61d92 100644 --- a/test/CodeGen/X86/avx512-mov.ll +++ b/test/CodeGen/X86/avx512-mov.ll @@ -153,3 +153,12 @@ define void @test18(i8 * %addr, <8 x i64> %data) { ret void } +; CHECK-LABEL: store_i1 +; CHECK: movb +; CHECK: movb +; CHECK: ret +define void @store_i1() { + store i1 true, i1 addrspace(3)* undef, align 128 + store i1 false, i1 addrspace(2)* undef, align 128 + ret void +} \ No newline at end of file