mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2026-04-25 05:22:04 +00:00
LoopVectorizer: Add a basic cost model which uses the VTTI interface.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166620 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
@@ -18,10 +18,13 @@
|
||||
//
|
||||
// This pass has four parts:
|
||||
// 1. The main loop pass that drives the different parts.
|
||||
// 2. LoopVectorizationLegality - A helper class that checks for the legality
|
||||
// 2. LoopVectorizationLegality - A unit that checks for the legality
|
||||
// of the vectorization.
|
||||
// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
|
||||
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
|
||||
// widening of instructions.
|
||||
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
|
||||
// of vectorization. It decides on the optimal vector width, which
|
||||
// can be one, if vectorization is not profitable.
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The reduction-variable vectorization is based on the paper:
|
||||
@@ -51,13 +54,14 @@
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/AliasSetTracker.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Analysis/ScalarEvolution.h"
|
||||
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
||||
#include "llvm/Analysis/ScalarEvolutionExpander.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/TargetTransformInfo.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
@@ -67,13 +71,14 @@
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<unsigned>
|
||||
DefaultVectorizationFactor("default-loop-vectorize-width",
|
||||
cl::init(4), cl::Hidden,
|
||||
cl::desc("Set the default loop vectorization width"));
|
||||
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
|
||||
cl::desc("Set the default vectorization width. Zero is autoselect."));
|
||||
|
||||
namespace {
|
||||
|
||||
// Forward declaration.
|
||||
// Forward declarations.
|
||||
class LoopVectorizationLegality;
|
||||
class LoopVectorizationCostModel;
|
||||
|
||||
/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
|
||||
/// block to a specified vectorization factor (VF).
|
||||
@@ -229,11 +234,10 @@ public:
|
||||
/// of the reductions that were found in the loop.
|
||||
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
|
||||
|
||||
/// Returns the maximum vectorization factor that we *can* use to vectorize
|
||||
/// this loop. This does not mean that it is profitable to vectorize this
|
||||
/// loop, only that it is legal to do so. This may be a large number. We
|
||||
/// can vectorize to any SIMD width below this number.
|
||||
unsigned getLoopMaxVF();
|
||||
/// Returns true if it is legal to vectorize this loop.
|
||||
/// This does not mean that it is profitable to vectorize this
|
||||
/// loop, only that it is legal to do so.
|
||||
bool canVectorize();
|
||||
|
||||
/// Returns the Induction variable.
|
||||
PHINode *getInduction() {return Induction;}
|
||||
@@ -286,6 +290,49 @@ private:
|
||||
SmallPtrSet<Value*, 4> AllowedExit;
|
||||
};
|
||||
|
||||
/// LoopVectorizationCostModel - estimates the expected speedups due to
|
||||
/// vectorization.
|
||||
/// In many cases vectorization is not profitable. This can happen because
|
||||
/// of a number of reasons. In this class we mainly attempt to predict
|
||||
/// the expected speedup/slowdowns due to the supported instruction set.
|
||||
/// We use the VectorTargetTransformInfo to query the different backends
|
||||
/// for the cost of different operations.
|
||||
class LoopVectorizationCostModel {
|
||||
public:
|
||||
/// C'tor.
|
||||
LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
|
||||
LoopVectorizationLegality *Leg,
|
||||
const VectorTargetTransformInfo *Vtti):
|
||||
TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
|
||||
|
||||
/// Returns the most profitable vectorization factor for the loop that is
|
||||
/// smaller or equal to the VF argument. This method checks every power
|
||||
/// of two up to VF.
|
||||
unsigned findBestVectorizationFactor(unsigned VF = 4);
|
||||
|
||||
private:
|
||||
/// Returns the expected execution cost. The unit of the cost does
|
||||
/// not matter because we use the 'cost' units to compare different
|
||||
/// vector widths. The cost that is returned is *not* normalized by
|
||||
/// the factor width.
|
||||
unsigned expectedCost(unsigned VF);
|
||||
|
||||
/// Returns the execution time cost of an instruction for a given vector
|
||||
/// width. Vector width of one means scalar.
|
||||
unsigned getInstructionCost(Instruction *I, unsigned VF);
|
||||
|
||||
/// The loop that we evaluate.
|
||||
Loop *TheLoop;
|
||||
/// Scev analysis.
|
||||
ScalarEvolution *SE;
|
||||
/// DataLayout analysis.
|
||||
DataLayout *DL;
|
||||
/// Vectorization legality.
|
||||
LoopVectorizationLegality *Legal;
|
||||
/// Vector target information.
|
||||
const VectorTargetTransformInfo *VTTI;
|
||||
};
|
||||
|
||||
struct LoopVectorize : public LoopPass {
|
||||
static char ID; // Pass identification, replacement for typeid
|
||||
|
||||
@@ -296,6 +343,7 @@ struct LoopVectorize : public LoopPass {
|
||||
ScalarEvolution *SE;
|
||||
DataLayout *DL;
|
||||
LoopInfo *LI;
|
||||
TargetTransformInfo *TTI;
|
||||
|
||||
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
|
||||
// We only vectorize innermost loops.
|
||||
@@ -305,25 +353,42 @@ struct LoopVectorize : public LoopPass {
|
||||
SE = &getAnalysis<ScalarEvolution>();
|
||||
DL = getAnalysisIfAvailable<DataLayout>();
|
||||
LI = &getAnalysis<LoopInfo>();
|
||||
TTI = getAnalysisIfAvailable<TargetTransformInfo>();
|
||||
|
||||
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
|
||||
L->getHeader()->getParent()->getName() << "\"\n");
|
||||
|
||||
// Check if it is legal to vectorize the loop.
|
||||
LoopVectorizationLegality LVL(L, SE, DL);
|
||||
unsigned MaxVF = LVL.getLoopMaxVF();
|
||||
|
||||
// Check that we can vectorize this loop using the chosen vectorization
|
||||
// width.
|
||||
if (MaxVF < DefaultVectorizationFactor) {
|
||||
DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
|
||||
if (!LVL.canVectorize()) {
|
||||
DEBUG(dbgs() << "LV: Not vectorizing.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
|
||||
// Select the preferred vectorization factor.
|
||||
unsigned VF = 1;
|
||||
if (VectorizationFactor == 0) {
|
||||
const VectorTargetTransformInfo *VTTI = 0;
|
||||
if (TTI)
|
||||
VTTI = TTI->getVectorTargetTransformInfo();
|
||||
// Use the cost model.
|
||||
LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
|
||||
VF = CM.findBestVectorizationFactor();
|
||||
|
||||
if (VF == 1) {
|
||||
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
} else {
|
||||
// Use the user command flag.
|
||||
VF = VectorizationFactor;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
|
||||
|
||||
// If we decided that it is *legal* to vectorize the loop then do it.
|
||||
SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
|
||||
SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
|
||||
LB.vectorize(&LVL);
|
||||
|
||||
DEBUG(verifyFunction(*L->getHeader()->getParent()));
|
||||
@@ -656,6 +721,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
|
||||
|
||||
void
|
||||
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
|
||||
//===------------------------------------------------===//
|
||||
//
|
||||
// Notice: any optimization or new instruction that goes
|
||||
// into the code below should also be implemented in
|
||||
// the cost-model.
|
||||
//
|
||||
//===------------------------------------------------===//
|
||||
typedef SmallVector<PHINode*, 4> PhiVector;
|
||||
BasicBlock &BB = *OrigLoop->getHeader();
|
||||
Constant *Zero = ConstantInt::get(
|
||||
@@ -957,18 +1029,18 @@ void SingleBlockLoopVectorizer::cleanup() {
|
||||
SE->forgetLoop(OrigLoop);
|
||||
}
|
||||
|
||||
unsigned LoopVectorizationLegality::getLoopMaxVF() {
|
||||
bool LoopVectorizationLegality::canVectorize() {
|
||||
if (!TheLoop->getLoopPreheader()) {
|
||||
assert(false && "No preheader!!");
|
||||
DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
|
||||
return 1;
|
||||
return false;
|
||||
}
|
||||
|
||||
// We can only vectorize single basic block loops.
|
||||
unsigned NumBlocks = TheLoop->getNumBlocks();
|
||||
if (NumBlocks != 1) {
|
||||
DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
|
||||
return 1;
|
||||
return false;
|
||||
}
|
||||
|
||||
// We need to have a loop header.
|
||||
@@ -978,22 +1050,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
|
||||
// Go over each instruction and look at memory deps.
|
||||
if (!canVectorizeBlock(*BB)) {
|
||||
DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
|
||||
return 1;
|
||||
return false;
|
||||
}
|
||||
|
||||
// ScalarEvolution needs to be able to find the exit count.
|
||||
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
|
||||
if (ExitCount == SE->getCouldNotCompute()) {
|
||||
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
|
||||
return 1;
|
||||
return false;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
|
||||
|
||||
// Okay! We can vectorize. At this point we don't have any other mem analysis
|
||||
// which may limit our maximum vectorization factor, so just return the
|
||||
// maximum SIMD size.
|
||||
return DefaultVectorizationFactor;
|
||||
// which may limit our maximum vectorization factor, so just return true with
|
||||
// no restrictions.
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
|
||||
@@ -1323,6 +1395,177 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned
|
||||
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
|
||||
if (!VTTI) {
|
||||
DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
float Cost = expectedCost(1);
|
||||
unsigned Width = 1;
|
||||
DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
|
||||
for (unsigned i=2; i <= VF; i*=2) {
|
||||
// Notice that the vector loop needs to be executed less times, so
|
||||
// we need to divide the cost of the vector loops by the width of
|
||||
// the vector elements.
|
||||
float VectorCost = expectedCost(i) / (float)i;
|
||||
DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
|
||||
(int)VectorCost << ".\n");
|
||||
if (VectorCost < Cost) {
|
||||
Cost = VectorCost;
|
||||
Width = i;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
|
||||
return Width;
|
||||
}
|
||||
|
||||
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
|
||||
// We can only estimate the cost of single basic block loops.
|
||||
assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
|
||||
|
||||
BasicBlock *BB = TheLoop->getHeader();
|
||||
unsigned Cost = 0;
|
||||
|
||||
// For each instruction in the old loop.
|
||||
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
||||
Instruction *Inst = it;
|
||||
Cost += getInstructionCost(Inst, VF);
|
||||
}
|
||||
|
||||
// Return the cost divided by VF, because we will be executing
|
||||
// less iterations of the vector form.
|
||||
return Cost;
|
||||
}
|
||||
|
||||
unsigned
|
||||
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
|
||||
assert(VTTI && "Invalid vector target transformation info");
|
||||
switch (I->getOpcode()) {
|
||||
case Instruction::Br: {
|
||||
return VTTI->getInstrCost(I->getOpcode());
|
||||
}
|
||||
case Instruction::PHI:
|
||||
// PHIs are handled the same as the binary instructions below.
|
||||
case Instruction::Add:
|
||||
case Instruction::FAdd:
|
||||
case Instruction::Sub:
|
||||
case Instruction::FSub:
|
||||
case Instruction::Mul:
|
||||
case Instruction::FMul:
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::FDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
case Instruction::FRem:
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
case Instruction::AShr:
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor: {
|
||||
Type *VTy = VectorType::get(I->getType(), VF);
|
||||
return VTTI->getInstrCost(I->getOpcode(), VTy);
|
||||
}
|
||||
case Instruction::Select: {
|
||||
SelectInst *SI = cast<SelectInst>(I);
|
||||
Type *VTy = VectorType::get(I->getType(), VF);
|
||||
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
|
||||
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
|
||||
Type *CondTy = SI->getCondition()->getType();
|
||||
if (ScalarCond)
|
||||
CondTy = VectorType::get(CondTy, VF);
|
||||
|
||||
return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
|
||||
}
|
||||
case Instruction::ICmp:
|
||||
case Instruction::FCmp: {
|
||||
Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
|
||||
return VTTI->getInstrCost(I->getOpcode(), VTy);
|
||||
}
|
||||
case Instruction::Store: {
|
||||
StoreInst *SI = cast<StoreInst>(I);
|
||||
Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
|
||||
|
||||
// Scalarized stores.
|
||||
if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
|
||||
unsigned Cost = 0;
|
||||
unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
|
||||
// The cost of extracting from the vector value.
|
||||
Cost += VF * ExtCost;
|
||||
// The cost of the scalar stores.
|
||||
Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
|
||||
return Cost;
|
||||
}
|
||||
|
||||
// Wide stores.
|
||||
return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
|
||||
SI->getPointerAddressSpace());
|
||||
}
|
||||
case Instruction::Load: {
|
||||
LoadInst *LI = cast<LoadInst>(I);
|
||||
Type *VTy = VectorType::get(I->getType(), VF);
|
||||
|
||||
// Scalarized loads.
|
||||
if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
|
||||
unsigned Cost = 0;
|
||||
unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
|
||||
// The cost of inserting the loaded value into the result vector.
|
||||
Cost += VF * InCost;
|
||||
// The cost of the scalar stores.
|
||||
Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
|
||||
return Cost;
|
||||
}
|
||||
|
||||
// Wide loads.
|
||||
return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
|
||||
LI->getPointerAddressSpace());
|
||||
}
|
||||
case Instruction::ZExt:
|
||||
case Instruction::SExt:
|
||||
case Instruction::FPToUI:
|
||||
case Instruction::FPToSI:
|
||||
case Instruction::FPExt:
|
||||
case Instruction::PtrToInt:
|
||||
case Instruction::IntToPtr:
|
||||
case Instruction::SIToFP:
|
||||
case Instruction::UIToFP:
|
||||
case Instruction::Trunc:
|
||||
case Instruction::FPTrunc:
|
||||
case Instruction::BitCast: {
|
||||
Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
|
||||
Type *DstTy = VectorType::get(I->getType(), VF);
|
||||
return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
|
||||
}
|
||||
default: {
|
||||
// We are scalarizing the instruction. Return the cost of the scalar
|
||||
// instruction, plus the cost of insert and extract into vector
|
||||
// elements, times the vector width.
|
||||
unsigned Cost = 0;
|
||||
Type *Ty = I->getType();
|
||||
|
||||
if (!Ty->isVoidTy()) {
|
||||
Type *VTy = VectorType::get(Ty, VF);
|
||||
unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
|
||||
unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
|
||||
Cost += VF * (InsCost + ExtCost);
|
||||
}
|
||||
|
||||
/// We don't have any information on the scalar instruction, but maybe
|
||||
/// the target has.
|
||||
/// TODO: This may be a target-specific intrinsic.
|
||||
/// Need to add API for that.
|
||||
Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
|
||||
|
||||
return Cost;
|
||||
}
|
||||
}// end of switch.
|
||||
}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
char LoopVectorize::ID = 0;
|
||||
|
||||
Reference in New Issue
Block a user