From e670dc7848379e7d616ba631aaf0bb42dd4def07 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Thu, 5 Mar 2015 15:11:35 +0000
Subject: [PATCH] AVX-512, SKX: Enabled masked_load/store operations for this target.

Added lowering for ISD::CONCAT_VECTORS and ISD::INSERT_SUBVECTOR for i1
vectors; this is needed to pass all masked_memop.ll tests for SKX.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231371 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp |   4 +-
 lib/Target/X86/X86ISelLowering.cpp        | 111 ++++++++++++++++--
 lib/Target/X86/X86InstrAVX512.td          |  13 ++
 test/CodeGen/X86/masked_memop.ll          |  47 +++++++-
 4 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5507c703b51..5ed385ee483 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1116,7 +1116,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
 
 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
-  assert(OpNo == 2 && "Only know how to promote the mask!");
   SDValue DataOp = N->getValue();
   EVT DataVT = DataOp.getValueType();
   SDValue Mask = N->getMask();
@@ -1127,7 +1126,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpN
   if (!TLI.isTypeLegal(DataVT)) {
     if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
       DataOp = GetPromotedInteger(DataOp);
-      Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+      if (!TLI.isTypeLegal(MaskVT))
+        Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
       TruncateStore = true;
     }
     else {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e51149210bf..dd8a9cee9ad 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1473,7 +1473,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
 
     setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
@@ -1575,6 +1574,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SUB, MVT::v32i16, Legal);
     setOperationAction(ISD::SUB, MVT::v64i8, Legal);
     setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
 
     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
       const MVT VT = (MVT::SimpleValueType)i;
@@ -1598,7 +1601,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
     setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
-    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
 
     setOperationAction(ISD::AND, MVT::v8i32, Legal);
     setOperationAction(ISD::OR, MVT::v8i32, Legal);
@@ -4082,9 +4088,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   } else if (VT.getScalarType() == MVT::i1) {
-    assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+
+    assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
+           && "Unexpected vector type");
+    assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
+           && "Unexpected vector type");
     SDValue Cst = DAG.getConstant(0, MVT::i1);
-    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   } else
     llvm_unreachable("Unexpected vector type");
@@ -5893,8 +5903,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 }
 
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
-  MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG & DAG) {
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
+  unsigned NumOfOperands = Op.getNumOperands();
+
+  assert(isPowerOf2_32(NumOfOperands) &&
+         "Unexpected number of operands in CONCAT_VECTORS");
+
+  if (NumOfOperands > 2) {
+    MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+                                  ResVT.getVectorNumElements()/2);
+    SmallVector<SDValue, 2> Ops;
+    for (unsigned i = 0; i < NumOfOperands/2; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    Ops.clear();
+    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+  }
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  if (IsZeroV1 && IsZeroV2)
+    return getZeroVector(ResVT, Subtarget, DAG, dl);
+
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+  SDValue Undef = DAG.getUNDEF(ResVT);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+  V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
+  V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
+  if (IsZeroV1)
+    return V2;
+
+  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  // Zero the upper bits of V1
+  V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
+  V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
+  if (IsZeroV2)
+    return V1;
+  return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+                                   const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  if (VT.getVectorElementType() == MVT::i1)
+    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
           Op.getNumOperands() == 4)));
@@ -10618,6 +10684,37 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
 
+  if (OpVT.getVectorElementType() == MVT::i1) {
+    if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
+      return Op;
+    SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+    SDValue Undef = DAG.getUNDEF(OpVT);
+    unsigned NumElems = OpVT.getVectorNumElements();
+    SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+    if (IdxVal == OpVT.getVectorNumElements() / 2) {
+      // Zero upper bits of the Vec
+      Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+      Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+
+      SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+                                 SubVec, ZeroIdx);
+      Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+      return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+    }
+    if (IdxVal == 0) {
+      SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+                                 SubVec, ZeroIdx);
+      // Zero upper bits of the Vec2
+      Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+      Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
+      // Zero lower bits of the Vec
+      Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+      Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+      // Merge them together
+      return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+    }
+  }
   return SDValue();
 }
 
@@ -16900,7 +16997,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
   case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
-  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
   case ISD::VSELECT: return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index cf2cb0d8b33..1e6ec8ecd3f 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2064,6 +2064,8 @@ let Predicates = [HasVLX] in {
           (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
 def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
           (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
+def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+          (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>;
 def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
           (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
 def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
@@ -2079,6 +2081,17 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
           (v8i1 (COPY_TO_REGCLASS
                   (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16),
                   (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
+
+def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))),
+          (v4i1 (COPY_TO_REGCLASS
+                  (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16),
+                  (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
+
+def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))),
+          (v4i1 (COPY_TO_REGCLASS
+                  (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16),
+                  (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Aligned and unaligned load and store
 //
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index f268c57f65b..da2f78655d1 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s -check-prefix=SKX
 
 ; AVX512-LABEL: test1
 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@@ -82,6 +83,9 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double>
 ; AVX2-LABEL: test6
 ; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
+
+; SKX-LABEL: test6
+; SKX: vmovupd {{.*}}{%k1}
 define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
   %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
@@ -91,6 +95,9 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
 ; AVX2-LABEL: test7
 ; AVX2: vmaskmovps {{.*}}(%rdi)
 ; AVX2: blend
+
+; SKX-LABEL: test7
+; SKX: vmovups (%rdi){{.*}}{%k1}
 define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
@@ -100,6 +107,9 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
 ; AVX2-LABEL: test8
 ; AVX2: vpmaskmovd {{.*}}(%rdi)
 ; AVX2: blend
+
+; SKX-LABEL: test8
+; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
 define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
@@ -108,6 +118,9 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
 ; AVX2-LABEL: test9
 ; AVX2: vpmaskmovd %xmm
+
+; SKX-LABEL: test9
+; SKX: vmovdqu32 %xmm{{.*}}{%k1}
 define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
@@ -117,23 +130,32 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
 ; AVX2-LABEL: test10
 ; AVX2: vmaskmovpd (%rdi), %ymm
 ; AVX2: blend
+
+; SKX-LABEL: test10
+; SKX: vmovapd {{.*}}{%k1}
 define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>%mask, <4 x double>%dst)
+  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
   ret <4 x double> %res
 }
 
 ; AVX2-LABEL: test11
 ; AVX2: vmaskmovps
 ; AVX2: vblendvps
+
+; SKX-LABEL: test11
+; SKX: vmovaps {{.*}}{%k1}
 define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
   ret <8 x float> %res
 }
 
 ; AVX2-LABEL: test12
 ; AVX2: vpmaskmovd %ymm
+
+; SKX-LABEL: test12
+; SKX: vmovdqu32 {{.*}}{%k1}
 define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
@@ -153,6 +175,12 @@ define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val)
 ; AVX2: vpshufd
 ; AVX2: vmovq
 ; AVX2: vmaskmovps
+
+; SKX-LABEL: test14
+; SKX: kshiftl
+; SKX: kshiftr
+; SKX: vmovups {{.*}}{%k1}
+
 define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
@@ -161,6 +189,11 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
 ; AVX2-LABEL: test15
 ; AVX2: vpmaskmovd
+
+; SKX-LABEL: test15
+; SKX: kshiftl
+; SKX: kshiftr
+; SKX: vmovdqu32 {{.*}}{%k1}
 define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
@@ -170,6 +203,11 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
 ; AVX2-LABEL: test16
 ; AVX2: vmaskmovps
 ; AVX2: vblendvps
+
+; SKX-LABEL: test16
+; SKX: kshiftl
+; SKX: kshiftr
+; SKX: vmovups {{.*}}{%k1}
 define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
@@ -180,6 +218,11 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
 ; AVX2: vpmaskmovd
 ; AVX2: vblendvps
 ; AVX2: vpmovsxdq
+
+; SKX-LABEL: test17
+; SKX: kshiftl
+; SKX: kshiftr
+; SKX: vmovdqu32 {{.*}}{%k1}
 define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
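For reference, a minimal, self-contained IR sketch of the kind of masked operation this patch enables on SKX. It is not part of the patch: the file name, function name, and run line below are illustrative assumptions; the intrinsic signature simply mirrors the @llvm.masked.load calls already used in masked_memop.ll.

; illustrative_masked.ll (assumed name) -- run e.g. with:
;   llc -mtriple=x86_64-apple-darwin -mcpu=skx < illustrative_masked.ll
define <2 x float> @illustrative_load(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
  ; Build a <2 x i1> mask from the trigger, exactly as test16 does.
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  ; Masked load: lanes whose mask bit is 0 take their value from %dst.
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
  ret <2 x float> %res
}
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)

As the SKX checks for test14-test17 indicate, the narrow <2 x i1> mask is widened in a mask register with a kshiftl/kshiftr pair before it feeds the masked vmovups/vmovdqu32. The same shift-and-or idea drives the new i1 CONCAT_VECTORS lowering above: each half is inserted at bit 0 of the wider mask type, the high half is shifted up by NumElems/2, the stray upper bits of the low half are cleared with a left/right shift pair, and the two halves are ORed together.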