From 029a76b0a26981afb6bc3252b2a75b16393d893e Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 12 Feb 2014 23:43:47 +0000 Subject: [PATCH] [Vectorizer] Add a new 'OperandValueKind' in TargetTransformInfo called 'OK_NonUniformConstValue' to identify operands which are constants but not constant splats. The cost model now allows returning 'OK_NonUniformConstValue' for non splat operands that are instances of ConstantVector or ConstantDataVector. With this change, targets are now able to compute different costs for instructions with non-uniform constant operands. For example, On X86 the cost of a vector shift may vary depending on whether the second operand is a uniform or non-uniform constant. This patch applies the following changes: - The cost model computation now takes into account non-uniform constants; - The cost of vector shift instructions has been improved in X86TargetTransformInfo analysis pass; - BBVectorize, SLPVectorizer and LoopVectorize now know how to distinguish between non-uniform and uniform constant operands. Added a new test to verify that the output of opt '-cost-model -analyze' is valid in the following configurations: SSE2, SSE4.1, AVX, AVX2. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201272 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 7 +- lib/Analysis/CostModel.cpp | 11 +- lib/Target/X86/X86TargetTransformInfo.cpp | 35 +++- lib/Transforms/Vectorize/BBVectorize.cpp | 55 ++++++- lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 20 ++- test/Analysis/CostModel/X86/vshift-cost.ll | 167 ++++++++++++++++++++ 7 files changed, 293 insertions(+), 15 deletions(-) create mode 100644 test/Analysis/CostModel/X86/vshift-cost.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index ec3e6065444..03ef8004c18 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -321,9 +321,10 @@ public: /// \brief Additional information about an operand's possible values. enum OperandValueKind { - OK_AnyValue, // Operand can have any value. - OK_UniformValue, // Operand is uniform (splat of a value). - OK_UniformConstantValue // Operand is uniform constant. + OK_AnyValue, // Operand can have any value. + OK_UniformValue, // Operand is uniform (splat of a value). + OK_UniformConstantValue, // Operand is uniform constant. + OK_NonUniformConstantValue // Operand is a non uniform constant value. }; /// \return The number of scalar or vector registers that the target has. diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index 543977a376f..898da8d0e8d 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -98,15 +98,20 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) { TargetTransformInfo::OperandValueKind OpInfo = TargetTransformInfo::OK_AnyValue; - // Check for a splat of a constant. + // Check for a splat of a constant or for a non uniform vector of constants. 
ConstantDataVector *CDV = 0; - if ((CDV = dyn_cast<ConstantDataVector>(V))) + if ((CDV = dyn_cast<ConstantDataVector>(V))) { + OpInfo = TargetTransformInfo::OK_NonUniformConstantValue; if (CDV->getSplatValue() != NULL) OpInfo = TargetTransformInfo::OK_UniformConstantValue; + } + ConstantVector *CV = 0; - if ((CV = dyn_cast<ConstantVector>(V))) + if ((CV = dyn_cast<ConstantVector>(V))) { + OpInfo = TargetTransformInfo::OK_NonUniformConstantValue; if (CV->getSplatValue() != NULL) OpInfo = TargetTransformInfo::OK_UniformConstantValue; + } return OpInfo; } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 207a7685c59..d50bab99ff3 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -225,6 +225,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // Look for AVX2 lowering tricks. if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); if (Idx != -1) return LT.first * AVX2CostTable[Idx].Cost; @@ -257,6 +264,20 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * SSE2UniformConstCostTable[Idx].Cost; } + if (ISD == ISD::SHL && + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { + EVT VT = LT.second; + if ((VT == MVT::v8i16 && ST->hasSSE2()) || + (VT == MVT::v4i32 && ST->hasSSE41())) + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). + return LT.first; + if (VT == MVT::v4i32 && ST->hasSSE2()) + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. 
+ ISD = ISD::MUL; + } static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as @@ -271,6 +292,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized. { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized. @@ -308,6 +330,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v16i16, 4 }, { ISD::MUL, MVT::v8i32, 4 }, { ISD::SUB, MVT::v8i32, 4 }, { ISD::ADD, MVT::v8i32, 4 }, @@ -323,7 +346,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // Look for AVX1 lowering tricks. if (ST->hasAVX() && !ST->hasAVX2()) { - int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second); + EVT VT = LT.second; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. + if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) && + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) + ISD = ISD::MUL; + + int Idx = CostTableLookup(AVX1CostTable, ISD, VT); if (Idx != -1) return LT.first * AVX1CostTable[Idx].Cost; } @@ -343,7 +374,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // 2x pmuludq, 2x shuffle. if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && !ST->hasSSE41()) - return 6; + return LT.first * 6; // Fallback to the default implementation. 
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 0cc1f3962a4..f59dd2160a9 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -532,7 +532,11 @@ namespace { // Returns the cost of the provided instruction using TTI. // This does not handle loads and stores. - unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { + unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue) { switch (Opcode) { default: break; case Instruction::GetElementPtr: @@ -562,7 +566,7 @@ namespace { case Instruction::And: case Instruction::Or: case Instruction::Xor: - return TTI->getArithmeticInstrCost(Opcode, T1); + return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); case Instruction::Select: case Instruction::ICmp: case Instruction::FCmp: @@ -1013,13 +1017,58 @@ namespace { unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); Type *VT1 = getVecTypeForPair(IT1, JT1), *VT2 = getVecTypeForPair(IT2, JT2); + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + + // On some targets (example X86) the cost of a vector shift may vary + // depending on whether the second operand is a Uniform or + // NonUniform Constant. + switch (I->getOpcode()) { + default : break; + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + + // If both I and J are scalar shifts by constant, then the + // merged vector shift count would be either a constant splat value + // or a non-uniform vector of constants. 
+ if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) + Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : + TargetTransformInfo::OK_NonUniformConstantValue; + } else { + // Check for a splat of a constant or for a non uniform vector + // of constants. + Value *IOp = I->getOperand(1); + Value *JOp = J->getOperand(1); + if (ConstantDataVector *CDVI = dyn_cast<ConstantDataVector>(IOp)) { + if (ConstantDataVector *CDVJ = dyn_cast<ConstantDataVector>(JOp)) { + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + Constant *SplatValue = CDVI->getSplatValue(); + if (SplatValue != NULL && SplatValue == CDVJ->getSplatValue()) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } + } + + if (ConstantVector *CVI = dyn_cast<ConstantVector>(IOp)) { + if (ConstantVector *CVJ = dyn_cast<ConstantVector>(JOp)) { + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + Constant *SplatValue = CVI->getSplatValue(); + if (SplatValue != NULL && SplatValue == CVJ->getSplatValue()) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } + } + } + } // Note that this procedure is incorrect for insert and extract element // instructions (because combining these often results in a shuffle), // but this cost is ignored (because insert and extract element // instructions are assigned a zero depth factor and are not really // fused in general). 
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); if (VCost > ICost + JCost) return false; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b52970119a5..ecbab63acf0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5491,9 +5491,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + Value *Op2 = I->getOperand(1); - if (isa<ConstantInt>(I->getOperand(1))) + // Check for a splat of a constant or for a non uniform vector of constants. + if (isa<ConstantInt>(Op2)) Op2VK = TargetTransformInfo::OK_UniformConstantValue; + else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Op2)) { + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (CDV->getSplatValue() != NULL) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } else if (ConstantVector *CV = dyn_cast<ConstantVector>(Op2)) { + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (CV->getSplatValue() != NULL) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); } diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9eadfb58bea..80826bd6b17 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1044,12 +1044,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - // Check whether all second operands are constant. - for (unsigned i = 0; i < VL.size(); ++i) - if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) { + // If all operands are exactly the same ConstantInt then set the + // operand kind to OK_UniformConstantValue. 
+ // If instead not all operands are constants, then set the operand kind + // to OK_AnyValue. If all operands are constants but not the same, + // then set the operand kind to OK_NonUniformConstantValue. + ConstantInt *CInt = NULL; + for (unsigned i = 0; i < VL.size(); ++i) { + const Instruction *I = cast<Instruction>(VL[i]); + if (!isa<ConstantInt>(I->getOperand(1))) { Op2VK = TargetTransformInfo::OK_AnyValue; break; } + if (i == 0) { + CInt = cast<ConstantInt>(I->getOperand(1)); + continue; + } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast<ConstantInt>(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + } ScalarCost = VecTy->getNumElements() * diff --git a/test/Analysis/CostModel/X86/vshift-cost.ll b/test/Analysis/CostModel/X86/vshift-cost.ll new file mode 100644 index 00000000000..84d72463ac0 --- /dev/null +++ b/test/Analysis/CostModel/X86/vshift-cost.ll @@ -0,0 +1,167 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost of vector shift left instructions. + +; We always emit a single pmullw in the case of v8i16 vector shifts by +; non-uniform constant. 
+ +define <8 x i16> @test1(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test1': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +define <8 x i16> @test2(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test2': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction. +; Make sure that the estimated cost is always 1 except for the case where +; we only have SSE2 support. With SSE2, we are forced to special lower the +; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle. + +define <4 x i32> @test3(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test3': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +define <4 x i32> @test4(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test4': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. 
+ +define <2 x i64> @test5(<2 x i64> %a) { + %shl = shl <2 x i64> %a, + ret <2 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test5': +; SSE2: Found an estimated cost of 20 for instruction: %shl +; SSE41: Found an estimated cost of 20 for instruction: %shl +; AVX: Found an estimated cost of 20 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; v16i16 and v8i32 shift left by non-uniform constant are lowered into +; vector multiply instructions. With AVX (but not AVX2), the vector multiply +; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert. +; +; With AVX2, instruction vpmullw works with 256bit quantities and +; therefore there is no need to split the resulting vector multiply into +; a sequence of two multiply. +; +; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice +; the cost computed in the case of 'test1'. That is because the backend +; simply emits 2 pmullw with no extract/insert. + + +define <16 x i16> @test6(<16 x i16> %a) { + %shl = shl <16 x i16> %a, + ret <16 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test6': +; SSE2: Found an estimated cost of 2 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice +; the cost computed in the case of 'test3'. That is because the multiply +; is type-legalized into two 4i32 vector multiply. 
+ +define <8 x i32> @test7(<8 x i32> %a) { + %shl = shl <8 x i32> %a, + ret <8 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test7': +; SSE2: Found an estimated cost of 12 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <4 x i64> @test8(<4 x i64> %a) { + %shl = shl <4 x i64> %a, + ret <4 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test8': +; SSE2: Found an estimated cost of 40 for instruction: %shl +; SSE41: Found an estimated cost of 40 for instruction: %shl +; AVX: Found an estimated cost of 40 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; Same as 'test6', with the difference that the cost is double. + +define <32 x i16> @test9(<32 x i16> %a) { + %shl = shl <32 x i16> %a, + ret <32 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test9': +; SSE2: Found an estimated cost of 4 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; Same as 'test7', except that now the cost is double. 
+ +define <16 x i32> @test10(<16 x i32> %a) { + %shl = shl <16 x i32> %a, + ret <16 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test10': +; SSE2: Found an estimated cost of 24 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a sequence of +; two vpsllvq instructions. Therefore, the expected cost is only 2. +; In all other cases, this shift is scalarized as we don't have vpsllv +; instructions. + +define <8 x i64> @test11(<8 x i64> %a) { + %shl = shl <8 x i64> %a, + ret <8 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE2: Found an estimated cost of 80 for instruction: %shl +; SSE41: Found an estimated cost of 80 for instruction: %shl +; AVX: Found an estimated cost of 80 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + +