From d54fed27865dcbc69932e1e6c372bb5a932e662a Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 23 Dec 2012 07:23:55 +0000 Subject: [PATCH] Loop Vectorizer: Update the cost model of scatter/gather operations and make them more expensive. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170995 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetTransformImpl.h | 2 - include/llvm/TargetTransformInfo.h | 26 +++--------- lib/Target/TargetTransformImpl.cpp | 10 +---- lib/Target/X86/X86ISelLowering.cpp | 1 - lib/Transforms/Vectorize/LoopVectorize.cpp | 42 +++++++++++++------ .../LoopVectorize/X86/cost-model.ll | 5 ++- 6 files changed, 40 insertions(+), 46 deletions(-) diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h index a94278cfa3b..59b7ffc826a 100644 --- a/include/llvm/Target/TargetTransformImpl.h +++ b/include/llvm/Target/TargetTransformImpl.h @@ -69,8 +69,6 @@ public: virtual ~VectorTargetTransformImpl() {} - virtual unsigned getInstrCost(unsigned Opcode, Type *Ty1, Type *Ty2) const; - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getBroadcastCost(Type *Tp) const; diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 59fcf3b7855..718d4bcfea9 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -135,44 +135,28 @@ public: virtual bool shouldBuildLookupTables() const { return true; } - /// getPopcntHwSupport - Return hardware support for population count. virtual PopcntHwSupport getPopcntHwSupport(unsigned IntTyWidthInBit) const { return None; } - /// getIntImmCost - Return the expected cost of materializing the given /// integer immediate of the specified type. virtual unsigned getIntImmCost(const APInt&, Type*) const { - // Default assumption is immediate is cheap. + // The default assumption is that the immediate is cheap. 
return 1; } }; /// VectorTargetTransformInfo - This interface is used by the vectorizers /// to estimate the profitability of vectorization for different instructions. +/// This interface provides the cost of different IR instructions. The cost +/// is unit-less and represents the estimated throughput of the instruction +/// (not the latency!) assuming that all branches are predicted, cache is hit, +/// etc. class VectorTargetTransformInfo { public: virtual ~VectorTargetTransformInfo() {} - /// Returns the expected cost of the instruction opcode. The opcode is one of - /// the enums like Instruction::Add. The type arguments are the type of the - /// operation. - /// Most instructions only use the first type and in that case the second - /// operand is ignored. - /// - /// Exceptions: - /// * Br instructions do not use any of the types. - /// * Select instructions pass the return type as Ty1 and the selector as Ty2. - /// * Cast instructions pass the destination as Ty1 and the source as Ty2. - /// * Insert/Extract element pass only the vector type as Ty1. - /// * ShuffleVector, Load, Store do not use this call. - virtual unsigned getInstrCost(unsigned Opcode, - Type *Ty1 = 0, - Type *Ty2 = 0) const { - return 1; - } - /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc. 
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return 1; diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp index 43204370202..1f568506d9a 100644 --- a/lib/Target/TargetTransformImpl.cpp +++ b/lib/Target/TargetTransformImpl.cpp @@ -132,7 +132,6 @@ int VectorTargetTransformImpl::InstructionOpcodeToISD(unsigned Opcode) const { std::pair VectorTargetTransformImpl::getTypeLegalizationCost(Type *Ty) const { - LLVMContext &C = Ty->getContext(); EVT MTy = TLI->getValueType(Ty); @@ -271,7 +270,7 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return getScalarizationOverhead(Dst, true, true) + Num * Cost; } - // We already handled vector-to-vector and scalar-to-scalar conversions. This + // We already handled vector-to-vector and scalar-to-scalar conversions. This // is where we handle bitcast between vectors and scalars. We need to assume // that the conversion is scalarized in one way or another. if (Opcode == Instruction::BitCast) @@ -283,6 +282,7 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst, } unsigned VectorTargetTransformImpl::getCFInstrCost(unsigned Opcode) const { + // Branches are assumed to be predicted. 
return 0; } @@ -330,12 +330,6 @@ unsigned VectorTargetTransformImpl::getVectorInstrCost(unsigned Opcode, return 1; } -unsigned -VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1, - Type *Ty2) const { - return 1; -} - unsigned VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 262475e97fc..b53a023a81b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17988,7 +17988,6 @@ X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty); } - unsigned X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f5ff79c0b90..5b1db0b9d14 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2080,17 +2080,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); // Scalarized stores. if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { unsigned Cost = 0; - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - ValTy); - // The cost of extracting from the value vector. - Cost += VF * (ExtCost); + + // The cost of extracting from the value vector and pointer vector. + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + for (unsigned i = 0; i < VF; ++i) { + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + VectorTy, i); + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); + } + // The cost of the scalar stores. 
Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), @@ -2107,16 +2113,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LoadInst *LI = cast(I); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), LI->getPointerAddressSpace()); // Scalarized loads. if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { unsigned Cost = 0; - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); - // The cost of inserting the loaded value into the result vector. - Cost += VF * (InCost); + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + // The cost of extracting from the pointer vector. + for (unsigned i = 0; i < VF; ++i) + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); + + // The cost of inserting data to the result vector. + for (unsigned i = 0; i < VF; ++i) + Cost += VTTI->getVectorInstrCost(Instruction::InsertElement, + VectorTy, i); + // The cost of the scalar stores. Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), @@ -2169,18 +2184,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { bool IsVoid = RetTy->isVoidTy(); unsigned InsCost = (IsVoid ? 0 : - VTTI->getInstrCost(Instruction::InsertElement, + VTTI->getVectorInstrCost(Instruction::InsertElement, VectorTy)); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, VectorTy); // The cost of inserting the results plus extracting each one of the // operands. Cost += VF * (InsCost + ExtCost * I->getNumOperands()); - // The cost of executing VF copies of the scalar instruction. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); + // The cost of executing VF copies of the scalar instruction. This opcode + // is unknown. Assume that it is the same as 'mul'. 
+ Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); return Cost; } }// end of switch. diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll index 628f9912c8c..b7f479acf96 100644 --- a/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -8,8 +8,11 @@ target triple = "x86_64-apple-macosx10.8.0" @d = common global [2048 x i32] zeroinitializer, align 16 @a = common global [2048 x i32] zeroinitializer, align 16 +; The program below gathers and scatters data. We had better not vectorize it. ;CHECK: cost_model_1 -;CHECK: <4 x i32> +;CHECK-NOT: <2 x i32> +;CHECK-NOT: <4 x i32> +;CHECK-NOT: <8 x i32> ;CHECK: ret void define void @cost_model_1() nounwind uwtable noinline ssp { entry: