TargetTransformInfo: address calculation parameter for gather/scather

Address calculation for gather/scather in vectorized code can incur a
significant cost making vectorization unbeneficial. Add infrastructure to add
cost.
Tests and cost model for targets will be in follow-up commits.

radar://14351991

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186187 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Arnold Schwaighofer 2013-07-12 19:16:02 +00:00
parent ec11706155
commit c0a11edba6
5 changed files with 69 additions and 9 deletions

View File

@ -339,7 +339,11 @@ public:
/// merged into the instruction indexing mode. Some targets might want to
/// distinguish between address computation for memory operations on vector
/// types and scalar types. Such targets should override this function.
virtual unsigned getAddressComputationCost(Type *Ty) const;
/// The 'IsComplex' parameter is a hint that the address computation is likely
/// to involve multiple instructions and as such unlikely to be merged into
/// the address indexing mode.
virtual unsigned getAddressComputationCost(Type *Ty,
bool IsComplex = false) const;
/// @}

View File

@ -206,8 +206,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return PrevTTI->getNumberOfParts(Tp);
}
unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
return PrevTTI->getAddressComputationCost(Tp);
unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
bool IsComplex) const {
return PrevTTI->getAddressComputationCost(Tp, IsComplex);
}
namespace {
@ -559,7 +560,7 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
return 0;
}
unsigned getAddressComputationCost(Type *Tp) const {
unsigned getAddressComputationCost(Type *Tp, bool) const {
return 0;
}
};

View File

@ -108,7 +108,7 @@ public:
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
ArrayRef<Type*> Tys) const;
virtual unsigned getNumberOfParts(Type *Tp) const;
virtual unsigned getAddressComputationCost(Type *Ty) const;
virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
/// @}
};
@ -489,6 +489,6 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
return LT.first;
}
unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
return 0;
}

View File

@ -124,7 +124,7 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
unsigned getAddressComputationCost(Type *Val) const;
unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info = OK_AnyValue,
@ -425,7 +425,7 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
// In many cases the address computation is not merged into the instruction
// addressing mode.
return 1;

View File

@ -4403,6 +4403,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
///
/// We look for a GEP which has one index that is an induction variable and all
/// other indices are loop invariant. If the stride of this access is also
/// within a small bound we decide that this address computation can likely be
/// merged into the addressing mode.
/// In all other cases, we identify the address computation as complex.
static bool isLikelyComplexAddressComputation(Value *Ptr,
LoopVectorizationLegality *Legal,
ScalarEvolution *SE,
const Loop *TheLoop) {
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return true;
// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
Value *Opd = Gep->getOperand(i);
if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
!Legal->isInductionVariable(Opd))
return true;
}
// Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
// can likely be merged into the address computation.
unsigned MaxMergeDistance = 64;
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
if (!AddRec)
return true;
// Check the step is constant.
const SCEV *Step = AddRec->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return true;
const APInt &APStepVal = C->getValue()->getValue();
// Huge step value - give up.
if (APStepVal.getBitWidth() > 64)
return true;
int64_t StepVal = APStepVal.getSExtValue();
return StepVal > MaxMergeDistance;
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
@ -4498,6 +4551,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
// The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@ -4513,7 +4568,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
}
// The cost of the scalar loads/stores.
Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
return Cost;