From 70bae89669665e8b786419177c86f1d8ed4678b2 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Sun, 25 Jan 2015 08:44:46 +0000
Subject: [PATCH] Implemented cost model for masked load/store operations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227035 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h     |  5 ++
 lib/Analysis/TargetTransformInfo.cpp            | 12 +++
 lib/CodeGen/BasicTargetTransformInfo.cpp        |  4 +
 lib/Target/X86/X86TargetTransformInfo.cpp       | 57 ++++++++++++
 .../CostModel/X86/masked-intrinsic-cost.ll      | 89 +++++++++++++++++++
 5 files changed, 167 insertions(+)
 create mode 100644 test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 4bd5dd8a221..fb834ae99f4 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -397,6 +397,11 @@ public:
                                    unsigned Alignment,
                                    unsigned AddressSpace) const;
 
+  /// \return The cost of masked Load and Store instructions.
+  virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                         unsigned Alignment,
+                                         unsigned AddressSpace) const;
+
   /// \brief Calculate the cost of performing a vector reduction.
   ///
   /// This is the cost of reducing the vector value of type \p Ty to a scalar
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index e3317851714..9184842052b 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -221,6 +221,13 @@ unsigned TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
   return PrevTTI->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 }
 
+unsigned
+TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) const {
+  return PrevTTI->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+}
+
 unsigned
 TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID,
                                            Type *RetTy,
@@ -623,6 +630,11 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
     return 1;
   }
 
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const override {
+    return 1;
+  }
+
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys) const override {
     return 1;
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 72da80646c5..4e962b469fb 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -582,6 +582,10 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
     return 0;
+  case Intrinsic::masked_store:
+    return TopTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
+  case Intrinsic::masked_load:
+    return TopTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
   }
 
   const TargetLoweringBase *TLI = getTLI();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 67488f7ad79..9d7f1238fff 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -96,6 +96,9 @@ public:
                               unsigned Index) const override;
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace) const override;
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace) const override;
   unsigned
  getAddressComputationCost(Type *PtrTy, bool IsComplex) const override;
@@ -917,6 +920,60 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   return Cost;
 }
 
+unsigned X86TTI::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+                                       unsigned Alignment,
+                                       unsigned AddressSpace) const {
+  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+  if (!SrcVTy)
+    // For a scalar type, take the regular memory-op cost, without a mask.
+    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+  unsigned NumElem = SrcVTy->getVectorNumElements();
+  VectorType *MaskTy =
+    VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
+  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
+      !isPowerOf2_32(NumElem)) {
+    // Scalarization
+    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+    unsigned ScalarCompareCost =
+      getCmpSelInstrCost(Instruction::ICmp,
+                         Type::getInt8Ty(getGlobalContext()), NULL);
+    unsigned BranchCost = getCFInstrCost(Instruction::Br);
+    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+    unsigned ValueSplitCost =
+      getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
+                               Opcode == Instruction::Store);
+    unsigned MemopCost = NumElem *
+      TargetTransformInfo::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                           Alignment, AddressSpace);
+    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+  }
+
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+  unsigned Cost = 0;
+  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+      LT.second.getVectorNumElements() == NumElem)
+    // Promotion requires expand/truncate for data and a shuffle for mask.
+    Cost += getShuffleCost(TargetTransformInfo::SK_Alternate, SrcVTy, 0, 0) +
+            getShuffleCost(TargetTransformInfo::SK_Alternate, MaskTy, 0, 0);
+
+  else if (LT.second.getVectorNumElements() > NumElem) {
+    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+                                            LT.second.getVectorNumElements());
+    // Expanding requires filling the mask with zeroes.
+    Cost += getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+                           NewMaskTy, 0, MaskTy);
+  }
+  if (!ST->hasAVX512())
+    return Cost + LT.first*4; // Each maskmov costs 4.
+
+  // AVX-512 masked load/store is cheaper.
+  return Cost + LT.first;
+}
+
 unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
new file mode 100644
index 00000000000..4683c432c55
--- /dev/null
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
+
+
+; AVX2-LABEL: test1
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+; AVX2-LABEL: test2
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+; AVX2-LABEL: test3
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test4
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  ret <8 x float> %res
+}
+
+; AVX2-LABEL: test5
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
+define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test6
+; AVX2: Found an estimated cost of 6 {{.*}}.masked
+define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test7
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
+define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+  ret <2 x float> %res
+}
+
+; AVX2-LABEL: test8
+; AVX2: Found an estimated cost of 6 {{.*}}.masked
+define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+  ret <2 x i32> %res
+}
+
+
+declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
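
As a rough usage sketch (not part of the patch): a cost-model client that already holds a TargetTransformInfo reference could query the new hook roughly as below. Only getMaskedMemoryOpCost, its Opcode/Alignment/AddressSpace parameters, and Instruction::Load come from the patch; the helper name, the LLVMContext argument, and the <8 x float> type are illustrative assumptions. On a core-avx2 target this is the same query the new test drives through "opt -cost-model -analyze", which expects "Found an estimated cost of 4" for an <8 x float> masked load (test4).

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: ask the target how expensive a masked load of
// <8 x float> would be (alignment 4, default address space), mirroring
// test4 in masked-intrinsic-cost.ll.
static unsigned maskedLoadCost(const TargetTransformInfo &TTI,
                               LLVMContext &Ctx) {
  VectorType *VecTy = VectorType::get(Type::getFloatTy(Ctx), 8); // <8 x float>
  return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy,
                                   /*Alignment=*/4, /*AddressSpace=*/0);
}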