mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-02 07:11:49 +00:00
LoopVectorizer:
1. Add code to estimate register pressure. 2. Add code to select the unroll factor based on register pressure. 3. Add bits to TargetTransformInfo to provide the number of registers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171469 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
e12bf18754
commit
e503319874
@ -69,6 +69,8 @@ public:
|
|||||||
|
|
||||||
virtual ~VectorTargetTransformImpl() {}
|
virtual ~VectorTargetTransformImpl() {}
|
||||||
|
|
||||||
|
virtual unsigned getNumberOfRegisters(bool Vector) const;
|
||||||
|
|
||||||
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
||||||
|
|
||||||
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
|
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
|
||||||
|
@ -164,12 +164,19 @@ public:
|
|||||||
ExtractSubvector // ExtractSubvector Index indicates start offset.
|
ExtractSubvector // ExtractSubvector Index indicates start offset.
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc.
|
/// \return The number of scalar or vector registers that the target has.
|
||||||
|
/// If 'Vector' is true, it returns the number of vector registers. If it is
|
||||||
|
/// set to false, it returns the number of scalar registers.
|
||||||
|
virtual unsigned getNumberOfRegisters(bool Vector) const {
|
||||||
|
// Fixed default answer: the Vector flag is not consulted here, so both the
// scalar and the vector query report 8 registers.
return 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
|
||||||
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
|
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the cost of a shuffle instruction of kind Kind and of type Tp.
|
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
|
||||||
/// The index and subtype parameters are used by the subvector insertion and
|
/// The index and subtype parameters are used by the subvector insertion and
|
||||||
/// extraction shuffle kinds.
|
/// extraction shuffle kinds.
|
||||||
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
|
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
|
||||||
@ -177,47 +184,47 @@ public:
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the expected cost of cast instructions, such as bitcast, trunc,
|
/// \return The expected cost of cast instructions, such as bitcast, trunc,
|
||||||
/// zext, etc.
|
/// zext, etc.
|
||||||
virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
|
virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
|
||||||
Type *Src) const {
|
Type *Src) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the expected cost of control-flow related instrutctions such as
|
/// \return The expected cost of control-flow related instructions such as
|
||||||
/// Phi, Ret, Br.
|
/// Phi, Ret, Br.
|
||||||
virtual unsigned getCFInstrCost(unsigned Opcode) const {
|
virtual unsigned getCFInstrCost(unsigned Opcode) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the expected cost of compare and select instructions.
|
/// \returns The expected cost of compare and select instructions.
|
||||||
virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
||||||
Type *CondTy = 0) const {
|
Type *CondTy = 0) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the expected cost of vector Insert and Extract.
|
/// \return The expected cost of vector Insert and Extract.
|
||||||
/// Use -1 to indicate that there is no information on the index value.
|
/// Use -1 to indicate that there is no information on the index value.
|
||||||
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
|
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||||
unsigned Index = -1) const {
|
unsigned Index = -1) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the cost of Load and Store instructions.
|
/// \return The cost of Load and Store instructions.
|
||||||
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
|
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
|
||||||
unsigned Alignment,
|
unsigned Alignment,
|
||||||
unsigned AddressSpace) const {
|
unsigned AddressSpace) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the cost of Intrinsic instructions.
|
/// \returns The cost of Intrinsic instructions.
|
||||||
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID,
|
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID,
|
||||||
Type *RetTy,
|
Type *RetTy,
|
||||||
ArrayRef<Type*> Tys) const {
|
ArrayRef<Type*> Tys) const {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the number of pieces into which the provided type must be
|
/// \returns The number of pieces into which the provided type must be
|
||||||
/// split during legalization. Zero is returned when the answer is unknown.
|
/// split during legalization. Zero is returned when the answer is unknown.
|
||||||
virtual unsigned getNumberOfParts(Type *Tp) const {
|
virtual unsigned getNumberOfParts(Type *Tp) const {
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -171,6 +171,10 @@ VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty,
|
|||||||
return Cost;
|
return Cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Generic implementation of the register-count query. It reports 8 registers
// regardless of whether scalar or vector registers are asked for (the Vector
// flag is not consulted).
unsigned VectorTargetTransformImpl::getNumberOfRegisters(bool Vector) const {
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
|
unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
|
||||||
Type *Ty) const {
|
Type *Ty) const {
|
||||||
// Check if any of the operands are vector operands.
|
// Check if any of the operands are vector operands.
|
||||||
|
@ -18115,6 +18115,13 @@ X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
|
|||||||
return ST.hasSSE41() ? Fast : None;
|
return ST.hasSSE41() ? Fast : None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned X86VectorTargetTransformInfo::getNumberOfRegisters(bool Vector) const {
|
||||||
|
const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
|
||||||
|
if (ST.is64Bit())
|
||||||
|
return 16;
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned
|
unsigned
|
||||||
X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
|
X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
|
||||||
Type *Ty) const {
|
Type *Ty) const {
|
||||||
|
@ -959,6 +959,8 @@ namespace llvm {
|
|||||||
explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
|
explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
|
||||||
VectorTargetTransformImpl(TL) {}
|
VectorTargetTransformImpl(TL) {}
|
||||||
|
|
||||||
|
virtual unsigned getNumberOfRegisters(bool Vector) const;
|
||||||
|
|
||||||
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
||||||
|
|
||||||
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
|
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
#include "LoopVectorize.h"
|
#include "LoopVectorize.h"
|
||||||
|
#include "llvm/ADT/SmallSet.h"
|
||||||
#include "llvm/ADT/StringExtras.h"
|
#include "llvm/ADT/StringExtras.h"
|
||||||
#include "llvm/Analysis/AliasAnalysis.h"
|
#include "llvm/Analysis/AliasAnalysis.h"
|
||||||
#include "llvm/Analysis/AliasSetTracker.h"
|
#include "llvm/Analysis/AliasSetTracker.h"
|
||||||
@ -43,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
|
|||||||
cl::desc("Sets the SIMD width. Zero is autoselect."));
|
cl::desc("Sets the SIMD width. Zero is autoselect."));
|
||||||
|
|
||||||
static cl::opt<unsigned>
|
static cl::opt<unsigned>
|
||||||
VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
|
VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
|
||||||
cl::desc("Sets the vectorization unroll count. "
|
cl::desc("Sets the vectorization unroll count. "
|
||||||
"Zero is autoselect."));
|
"Zero is autoselect."));
|
||||||
|
|
||||||
@ -94,7 +95,7 @@ struct LoopVectorize : public LoopPass {
|
|||||||
if (TTI)
|
if (TTI)
|
||||||
VTTI = TTI->getVectorTargetTransformInfo();
|
VTTI = TTI->getVectorTargetTransformInfo();
|
||||||
// Use the cost model.
|
// Use the cost model.
|
||||||
LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
|
LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI);
|
||||||
|
|
||||||
// Check the function attributes to find out if this function should be
|
// Check the function attributes to find out if this function should be
|
||||||
// optimized for size.
|
// optimized for size.
|
||||||
@ -112,6 +113,7 @@ struct LoopVectorize : public LoopPass {
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
|
unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
|
||||||
|
unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
|
||||||
|
|
||||||
if (VF == 1) {
|
if (VF == 1) {
|
||||||
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
|
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
|
||||||
@ -120,9 +122,10 @@ struct LoopVectorize : public LoopPass {
|
|||||||
|
|
||||||
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
|
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
|
||||||
F->getParent()->getModuleIdentifier()<<"\n");
|
F->getParent()->getModuleIdentifier()<<"\n");
|
||||||
|
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
|
||||||
|
|
||||||
// If we decided that it is *legal* to vectorize the loop then do it.
|
// If we decided that it is *legal* to vectorize the loop then do it.
|
||||||
InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll);
|
InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
|
||||||
LB.vectorize(&LVL);
|
LB.vectorize(&LVL);
|
||||||
|
|
||||||
DEBUG(verifyFunction(*L->getHeader()->getParent()));
|
DEBUG(verifyFunction(*L->getHeader()->getParent()));
|
||||||
@ -2082,7 +2085,7 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
|
|||||||
|
|
||||||
unsigned
|
unsigned
|
||||||
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
||||||
unsigned UserVF) {
|
unsigned UserVF) {
|
||||||
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
|
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
|
||||||
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
|
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
|
||||||
return 1;
|
return 1;
|
||||||
@ -2148,6 +2151,161 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
|||||||
return Width;
|
return Width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned
|
||||||
|
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
|
||||||
|
unsigned UserUF) {
|
||||||
|
// Use the user preference, unless 'auto' is selected.
|
||||||
|
if (UserUF != 0)
|
||||||
|
return UserUF;
|
||||||
|
|
||||||
|
// When we optimize for size we don't unroll.
|
||||||
|
if (OptForSize)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true);
|
||||||
|
DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
|
||||||
|
" vector registers\n");
|
||||||
|
|
||||||
|
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
|
||||||
|
// We divide by these constants so assume that we have at least one
|
||||||
|
// instruction that uses at least one register.
|
||||||
|
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
|
||||||
|
R.NumInstructions = std::max(R.NumInstructions, 1U);
|
||||||
|
|
||||||
|
// We calculate the unroll factor using the following formula.
|
||||||
|
// Subtract the number of loop invariants from the number of available
|
||||||
|
// registers. These registers are used by all of the unrolled instances.
|
||||||
|
// Next, divide the remaining registers by the number of registers that is
|
||||||
|
// required by the loop, in order to estimate how many parallel instances
|
||||||
|
// fit without causing spills.
|
||||||
|
unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
|
||||||
|
|
||||||
|
// We don't want to unroll the loops to the point where they do not fit into
|
||||||
|
// the decoded cache. Assume that we only allow 32 IR instructions.
|
||||||
|
UF = std::min(UF, (32 / R.NumInstructions));
|
||||||
|
|
||||||
|
// Clamp the unroll factor ranges to reasonable factors.
|
||||||
|
if (UF > MaxUnrollSize)
|
||||||
|
UF = MaxUnrollSize;
|
||||||
|
else if (UF < 1)
|
||||||
|
UF = 1;
|
||||||
|
|
||||||
|
return UF;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate the register usage of the loop.
///
/// \return A RegisterUsage record holding the number of instructions in the
///         loop, the number of out-of-loop instructions used inside it, and
///         the largest number of simultaneously open live intervals seen
///         while scanning the loop.
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we know the last occurrence of each instruction. Next, we
  // transpose this data structure into a map that holds the list of intervals
  // that *end* at a specific location. This map allows us to perform a linear
  // scan: we walk the instructions in order, remove the intervals that end at
  // the current location from the set of open intervals, and add the interval
  // that starts there. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an index in the scan order to the instruction at that position.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the instructions that are used in the loop but are defined outside
  // of it. Non-instruction operands (arguments, constants) are not recorded.
  SmallPtrSet<Value*, 8> LoopInvariants;

  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE. Note that Index has already been
      // advanced, so an interval ends one position after its last user.
      for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
        Value *U = I->getOperand(Op);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  typedef DenseMap<unsigned, InstrList> EndListMap;
  EndListMap TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned Idx = 0; Idx < Index; ++Idx) {
    Instruction *I = IdxToInstr[Idx];

    // Remove all of the intervals that end at this location. This has to be
    // done for every position, including instructions that have no in-loop
    // users themselves (a store, for example); otherwise the intervals that
    // end there would stay open for the rest of the scan and inflate the
    // estimate.
    EndListMap::iterator Ending = TransposeEnds.find(Idx);
    if (Ending != TransposeEnds.end()) {
      InstrList &List = Ending->second;
      for (unsigned j = 0, e = List.size(); j != e; ++j)
        OpenIntervals.erase(List[j]);
    }

    // Instructions that are never used within the loop do not open an
    // interval.
    if (!Ends.count(I)) continue;

    // Count the number of live intervals.
    unsigned NumOpen = OpenIntervals.size();
    MaxUsage = std::max(MaxUsage, NumOpen);

    DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " <<
          NumOpen <<"\n");

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}
|
||||||
|
|
||||||
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
|
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
|
||||||
unsigned Cost = 0;
|
unsigned Cost = 0;
|
||||||
|
|
||||||
|
@ -68,6 +68,9 @@ const unsigned RuntimeMemoryCheckThreshold = 4;
|
|||||||
/// This is the highest vector width that we try to generate.
|
/// This is the highest vector width that we try to generate.
|
||||||
const unsigned MaxVectorSize = 8;
|
const unsigned MaxVectorSize = 8;
|
||||||
|
|
||||||
|
/// This is the highest Unroll Factor.
|
||||||
|
const unsigned MaxUnrollSize = 4;
|
||||||
|
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
|
|
||||||
// Forward declarations.
|
// Forward declarations.
|
||||||
@ -473,17 +476,37 @@ private:
|
|||||||
class LoopVectorizationCostModel {
|
class LoopVectorizationCostModel {
|
||||||
public:
|
public:
|
||||||
/// C'tor.
|
/// C'tor.
|
||||||
LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
|
LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo *Li,
|
||||||
LoopVectorizationLegality *Leg,
|
LoopVectorizationLegality *Leg,
|
||||||
const VectorTargetTransformInfo *Vtti):
|
const VectorTargetTransformInfo *Vtti):
|
||||||
TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
|
TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), VTTI(Vtti) { }
|
||||||
|
|
||||||
/// Returns the most profitable vectorization factor in powers of two.
|
/// \return The most profitable vectorization factor.
|
||||||
/// This method checks every power of two up to VF. If UserVF is not ZERO
|
/// This method checks every power of two up to VF. If UserVF is not ZERO
|
||||||
/// then this vectorization factor will be selected if vectorization is
|
/// then this vectorization factor will be selected if vectorization is
|
||||||
/// possible.
|
/// possible.
|
||||||
unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
|
unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
|
||||||
|
|
||||||
|
|
||||||
|
/// \return The most profitable unroll factor.
|
||||||
|
/// If UserUF is non-zero it is returned unchanged. Otherwise this method
|
||||||
|
/// picks an unroll factor based on register pressure and other parameters.
|
||||||
|
unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
|
||||||
|
|
||||||
|
/// \brief A struct that represents some properties of the register usage
|
||||||
|
/// of a loop.
|
||||||
|
struct RegisterUsage {
|
||||||
|
/// Holds the number of distinct instructions that are defined outside the
/// loop but used as operands inside it. Arguments and constants are not
/// counted.
|
||||||
|
unsigned LoopInvariantRegs;
|
||||||
|
/// Holds the maximum number of live intervals that were open at the same
/// time during the linear scan of the loop.
|
||||||
|
unsigned MaxLocalUsers;
|
||||||
|
/// Holds the number of instructions in the loop, summed over its blocks.
|
||||||
|
unsigned NumInstructions;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// \return information about the register usage of the loop.
|
||||||
|
RegisterUsage calculateRegisterUsage();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// Returns the expected execution cost. The unit of the cost does
|
/// Returns the expected execution cost. The unit of the cost does
|
||||||
/// not matter because we use the 'cost' units to compare different
|
/// not matter because we use the 'cost' units to compare different
|
||||||
@ -504,7 +527,8 @@ private:
|
|||||||
Loop *TheLoop;
|
Loop *TheLoop;
|
||||||
/// Scev analysis.
|
/// Scev analysis.
|
||||||
ScalarEvolution *SE;
|
ScalarEvolution *SE;
|
||||||
|
/// Loop Info analysis.
|
||||||
|
LoopInfo *LI;
|
||||||
/// Vectorization legality.
|
/// Vectorization legality.
|
||||||
LoopVectorizationLegality *Legal;
|
LoopVectorizationLegality *Legal;
|
||||||
/// Vector target information.
|
/// Vector target information.
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
|
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
|
||||||
|
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL
|
||||||
|
|
||||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
target triple = "x86_64-apple-macosx10.8.0"
|
target triple = "x86_64-apple-macosx10.8.0"
|
||||||
@ -13,6 +14,15 @@ target triple = "x86_64-apple-macosx10.8.0"
|
|||||||
;CHECK: add nsw <4 x i32>
|
;CHECK: add nsw <4 x i32>
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
|
||||||
|
;UNROLL: @example1
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example1() nounwind uwtable ssp {
|
define void @example1() nounwind uwtable ssp {
|
||||||
br label %1
|
br label %1
|
||||||
|
|
||||||
@ -34,13 +44,20 @@ define void @example1() nounwind uwtable ssp {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
|
||||||
; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
|
|
||||||
;CHECK: @example10b
|
;CHECK: @example10b
|
||||||
;CHECK: load <4 x i16>
|
;CHECK: load <4 x i16>
|
||||||
;CHECK: sext <4 x i16>
|
;CHECK: sext <4 x i16>
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example10b
|
||||||
|
;UNROLL: load <4 x i16>
|
||||||
|
;UNROLL: load <4 x i16>
|
||||||
|
;UNROLL: load <4 x i16>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
|
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
|
||||||
br label %1
|
br label %1
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
|
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
|
||||||
|
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL
|
||||||
|
|
||||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
target triple = "x86_64-apple-macosx10.8.0"
|
target triple = "x86_64-apple-macosx10.8.0"
|
||||||
@ -24,6 +25,20 @@ target triple = "x86_64-apple-macosx10.8.0"
|
|||||||
;CHECK: add nsw <4 x i32>
|
;CHECK: add nsw <4 x i32>
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example1
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: add nsw <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example1() nounwind uwtable ssp {
|
define void @example1() nounwind uwtable ssp {
|
||||||
br label %1
|
br label %1
|
||||||
|
|
||||||
@ -48,6 +63,12 @@ define void @example1() nounwind uwtable ssp {
|
|||||||
;CHECK: @example2
|
;CHECK: @example2
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example2
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
|
define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
|
||||||
%1 = icmp sgt i32 %n, 0
|
%1 = icmp sgt i32 %n, 0
|
||||||
br i1 %1, label %.lr.ph5, label %.preheader
|
br i1 %1, label %.lr.ph5, label %.preheader
|
||||||
@ -92,6 +113,12 @@ define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
|
|||||||
;CHECK: @example3
|
;CHECK: @example3
|
||||||
;CHECK: <4 x i32>
|
;CHECK: <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example3
|
||||||
|
;UNROLL: <4 x i32>
|
||||||
|
;UNROLL: <4 x i32>
|
||||||
|
;UNROLL: <4 x i32>
|
||||||
|
;UNROLL: <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
|
define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
|
||||||
%1 = icmp eq i32 %n, 0
|
%1 = icmp eq i32 %n, 0
|
||||||
br i1 %1, label %._crit_edge, label %.lr.ph
|
br i1 %1, label %._crit_edge, label %.lr.ph
|
||||||
@ -115,6 +142,12 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
|
|||||||
;CHECK: @example4
|
;CHECK: @example4
|
||||||
;CHECK: load <4 x i32>
|
;CHECK: load <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example4
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: load <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
|
define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
|
||||||
%1 = add nsw i32 %n, -1
|
%1 = add nsw i32 %n, -1
|
||||||
%2 = icmp eq i32 %n, 0
|
%2 = icmp eq i32 %n, 0
|
||||||
@ -175,6 +208,12 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
|
|||||||
;CHECK: @example8
|
;CHECK: @example8
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
|
;UNROLL: @example8
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: store <4 x i32>
|
||||||
|
;UNROLL: ret void
|
||||||
define void @example8(i32 %x) nounwind uwtable ssp {
|
define void @example8(i32 %x) nounwind uwtable ssp {
|
||||||
br label %.preheader
|
br label %.preheader
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user