[LSR] Canonicalize reg1 + ... + regN into reg1 + ... + 1*regN.

This commit introduces a canonical representation for the formulae.
Basically, as soon as a formula has more than one base register, the
scaled-register field is used for one of them, preferably a register that
is loop-variant.
The commit refactors how the formulae are built in order to produce this
canonical representation.
This yields a more accurate, though still perfectible, cost model.

<rdar://problem/16731508>
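
To make the invariant concrete, here is a minimal, self-contained sketch of
the canonical form and of the canonicalization step. ToyReg, ToyFormula, and
the IsLoopVariant flag are illustrative stand-ins, not the patch's actual
classes; IsLoopVariant plays the role of the isa<SCEVAddRecExpr> test used
in the patch.

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for a register (a SCEV in the real code).
struct ToyReg {
  std::string Name;
  bool IsLoopVariant; // stands in for isa<SCEVAddRecExpr>(Reg)
};

// Illustrative stand-in for LSR's Formula.
struct ToyFormula {
  std::vector<ToyReg> BaseRegs;
  ToyReg ScaledReg{"", false}; // empty name <=> no scaled register
  int64_t Scale = 0;

  bool hasScaledReg() const { return !ScaledReg.Name.empty(); }

  // Invariants: (1) more than one base register implies a scaled register;
  // (2) a scaled register implies Scale != 1 or a non-empty BaseRegs.
  bool isCanonical() const {
    if (hasScaledReg())
      return Scale != 1 || !BaseRegs.empty();
    return BaseRegs.size() <= 1;
  }

  void canonicalize() {
    if (isCanonical())
      return;
    assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
    // Move one base register into the scaled slot with Scale = 1...
    ScaledReg = BaseRegs.back();
    BaseRegs.pop_back();
    Scale = 1;
    // ...preferring a loop-variant register, as the patch does.
    for (size_t I = 0, E = BaseRegs.size();
         I != E && !ScaledReg.IsLoopVariant; ++I)
      std::swap(ScaledReg, BaseRegs[I]);
  }
};

int main() {
  ToyFormula F;
  F.BaseRegs = {{"iv", true}, {"base", false}}; // iv + base
  F.canonicalize();                             // => base + 1*iv
  assert(F.isCanonical() && F.Scale == 1 && F.ScaledReg.Name == "iv");
}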


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209230 91177308-0d34-0410-b5e6-96231b3b80d8
Quentin Colombet 2014-05-20 19:25:04 +00:00
parent 8598e290c3
commit 50d4008b47
3 changed files with 393 additions and 198 deletions


@ -238,7 +238,15 @@ struct Formula {
int64_t Scale;
/// BaseRegs - The list of "base" registers for this use. When this is
/// non-empty, the canonical representation of a formula is:
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
/// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
SmallVector<const SCEV *, 4> BaseRegs;
/// ScaledReg - The 'scaled' register for this use. This should be non-null
@ -256,6 +264,12 @@ struct Formula {
void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
bool isCanonical() const;
void Canonicalize();
bool Unscale();
size_t getNumRegs() const;
Type *getType() const;
@ -346,6 +360,52 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
Canonicalize();
}
/// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical() const {
if (ScaledReg)
return Scale != 1 || !BaseRegs.empty();
return BaseRegs.size() <= 1;
}
/// \brief Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register must use the ScaledReg
/// field. Otherwise, we would need special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
void Formula::Canonicalize() {
if (isCanonical())
return;
// So far we have not needed this case. It would be easy to implement, but
// maintaining dead code is pointless. Besides, it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
// Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
ScaledReg = BaseRegs.back();
BaseRegs.pop_back();
Scale = 1;
size_t BaseRegsSize = BaseRegs.size();
size_t Try = 0;
// If ScaledReg is an invariant, try to find a variant expression.
while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
std::swap(ScaledReg, BaseRegs[Try++]);
}
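// Illustration (not part of the patch): starting from BaseRegs = {addrec,
// inv} with no ScaledReg, 'inv' is popped into ScaledReg with Scale = 1 and
// the swap loop above then exchanges it with the add-rec, giving
// inv + 1*addrec.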
/// \brief Get rid of the scale in the formula.
/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
bool Formula::Unscale() {
if (Scale != 1)
return false;
Scale = 0;
BaseRegs.push_back(ScaledReg);
ScaledReg = nullptr;
return true;
}
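// Example: reg1 + 1*reg2 becomes reg1 + reg2 and true is returned, whereas
// for reg1 + 2*reg2 the scale cannot be dropped, so the formula is left
// untouched and false is returned.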
/// getNumRegs - Return the total number of register operands used by this
@ -776,9 +836,18 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
namespace {
class LSRUse;
}
// Check if it is legal to fold 2 base registers.
static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
const Formula &F);
/// \brief Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
/// defines and up to 1 base + 1 scaled + offset.
/// In other words, if \p F has several base registers, this function may
/// still return true. Therefore, users still need to account for
/// additional base registers and/or unfolded offsets to derive an
/// accurate cost model.
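/// For instance, if \p F is reg1 + reg2 + 1*reg3 + offset and the target
/// folds base + scaled + offset, this returns true even though reg1 is an
/// extra base register whose cost the caller must still account for.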
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
@ -922,6 +991,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSet<const SCEV *, 16> *LoserRegs) {
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
// Tally up the registers.
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
@ -945,11 +1015,13 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
}
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
size_t NumBaseParts = F.getNumRegs();
if (NumBaseParts > 1)
// Do not count the base and a possible second register if the target
// allows folding 2 registers.
NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
NumBaseAdds +=
NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
NumBaseAdds += (F.UnfoldedOffset != 0);
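// E.g., for F = reg1 + reg2 + 1*reg3 on a target that folds base + scaled,
// getNumRegs() is 3 and 3 - (1 + 1) = 1 unfolded add is counted; a non-zero
// unfolded offset costs one more add.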
// Accumulate non-free scaling amounts.
ScaleCost += getScalingFactorCost(TTI, LU, F);
@ -1210,7 +1282,10 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
/// The formula must be in canonical form.
bool LSRUse::InsertFormula(const Formula &F) {
assert(F.isCanonical() && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
@ -1236,6 +1311,8 @@ bool LSRUse::InsertFormula(const Formula &F) {
// Record registers now being used by this use.
Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
if (F.ScaledReg)
Regs.insert(F.ScaledReg);
return true;
}
@ -1302,12 +1379,10 @@ void LSRUse::dump() const {
}
#endif
/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale) {
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, Type *AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale) {
switch (Kind) {
case LSRUse::Address:
return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
@ -1358,10 +1433,11 @@ static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
llvm_unreachable("Invalid LSRUse Kind!");
}
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
int64_t Scale) {
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, Type *AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale) {
// Check for overflow.
if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
(MinOffset > 0))
@ -1372,9 +1448,41 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
return false;
MaxOffset = (uint64_t)BaseOffset + MaxOffset;
return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg,
Scale) &&
isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale);
return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
HasBaseReg, Scale) &&
isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
HasBaseReg, Scale);
}
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, Type *AccessTy,
const Formula &F) {
// For the purpose of isAMCompletelyFolded, either having a canonical formula
// or a scale not equal to zero is correct.
// Problems may arise from non-canonical formulae having a scale == 0.
// Strictly speaking, it would be best to rely only on canonical formulae.
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg, to
// save compile time.
assert((F.isCanonical() || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
/// isLegalUse - Test whether we know how to expand the current formula.
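/// A formula with Scale == 1 that the target cannot fold is still legal:
/// the sum of registers can be materialized with an add and then used as a
/// single base register, which is what the Scale == 1 query below checks.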
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
int64_t Scale) {
// We know how to expand completely foldable formulae.
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
BaseOffset, HasBaseReg, Scale) ||
// Or formulae that use a base register produced by a sum of base
// registers.
(Scale == 1 &&
isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
BaseGV, BaseOffset, true, 0));
}
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
@ -1384,36 +1492,23 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
F.BaseOffset, F.HasBaseReg, F.Scale);
}
static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
const Formula &F) {
// If F is used as an Addressing Mode, it may fold one Base plus one
// scaled register. If the scaled register is nil, do as if another
// element of the base regs is a 1-scaled register.
// This is possible if BaseRegs has at least 2 registers.
// If this is not an address calculation, this is not an addressing mode
// use.
if (LU.Kind != LSRUse::Address)
return false;
// F is already scaled.
if (F.Scale != 0)
return false;
// We need to keep one register for the base and one to scale.
if (F.BaseRegs.size() < 2)
return false;
return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
}
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F) {
return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
F.Scale);
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F) {
if (!F.Scale)
return 0;
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, F) && "Illegal formula in use.");
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, F))
return F.Scale != 1;
switch (LU.Kind) {
case LSRUse::Address: {
@ -1432,12 +1527,10 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
}
case LSRUse::ICmpZero:
// ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
// Therefore, return 0 in case F.Scale == -1.
return F.Scale != -1;
case LSRUse::Basic:
case LSRUse::Special:
// The use is completely folded, i.e., everything is folded into the
// instruction.
return 0;
}
@ -1462,7 +1555,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
HasBaseReg = true;
}
return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
HasBaseReg, Scale);
}
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
@ -1487,8 +1581,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
// base and a scale.
int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
BaseOffset, HasBaseReg, Scale);
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
BaseOffset, HasBaseReg, Scale);
}
namespace {
@ -1644,8 +1738,19 @@ class LSRInstance {
void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
unsigned Depth = 0);
void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, unsigned Depth,
size_t Idx, bool IsScaledReg = false);
void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, size_t Idx,
bool IsScaledReg = false);
void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist,
size_t Idx, bool IsScaledReg = false);
void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
@ -2148,24 +2253,26 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
// the uses will have all its uses outside the loop, for example.
if (LU.Kind != Kind)
return false;
// Conservatively assume HasBaseReg is true for now.
if (NewOffset < LU.MinOffset) {
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
LU.MaxOffset - NewOffset, HasBaseReg))
return false;
NewMinOffset = NewOffset;
} else if (NewOffset > LU.MaxOffset) {
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
NewOffset - LU.MinOffset, HasBaseReg))
return false;
NewMaxOffset = NewOffset;
}
// Check for a mismatched access type, and fall back conservatively as needed.
// TODO: Be less conservative when the type is similar and can use the same
// addressing modes.
if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
NewAccessTy = Type::getVoidTy(AccessTy->getContext());
// Conservatively assume HasBaseReg is true for now.
if (NewOffset < LU.MinOffset) {
if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
LU.MaxOffset - NewOffset, HasBaseReg))
return false;
NewMinOffset = NewOffset;
} else if (NewOffset > LU.MaxOffset) {
if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
NewOffset - LU.MinOffset, HasBaseReg))
return false;
NewMaxOffset = NewOffset;
}
// Update the use.
LU.MinOffset = NewMinOffset;
LU.MaxOffset = NewMaxOffset;
@ -2994,6 +3101,9 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert a formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
if (!LU.InsertFormula(F))
return false;
@ -3149,84 +3259,104 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
return S;
}
/// \brief Helper function for LSRInstance::GenerateReassociations.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
unsigned Depth, size_t Idx,
bool IsScaledReg) {
const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
SmallVector<const SCEV *, 8> AddOps;
const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
if (Remainder)
AddOps.push_back(Remainder);
if (AddOps.size() == 1)
return;
for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
JE = AddOps.end();
J != JE; ++J) {
// Loop-variant "unknown" values are uninteresting; we won't be able to
// do anything meaningful with them.
if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
continue;
// Don't pull a constant into a register if the constant could be folded
// into an immediate field.
if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, *J, Base.getNumRegs() > 1))
continue;
// Collect all operands except *J.
SmallVector<const SCEV *, 8> InnerAddOps(
((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
InnerAddOps.append(std::next(J),
((const SmallVector<const SCEV *, 8> &)AddOps).end());
// Don't leave just a constant behind in a register if the constant could
// be folded into an immediate field.
if (InnerAddOps.size() == 1 &&
isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
continue;
const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
if (InnerSum->isZero())
continue;
Formula F = Base;
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue())) {
F.UnfoldedOffset =
(uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
if (IsScaledReg)
F.ScaledReg = nullptr;
else
F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
} else if (IsScaledReg)
F.ScaledReg = InnerSum;
else
F.BaseRegs[Idx] = InnerSum;
// Add J as its own register, or an unfolded immediate.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue()))
F.UnfoldedOffset =
(uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
else
F.BaseRegs.push_back(*J);
// We may have changed the number of registers in the base regs; adjust the
// formula accordingly.
F.Canonicalize();
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
}
}
/// GenerateReassociations - Split out subexpressions from adds and the bases of
/// addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base,
unsigned Depth) {
Formula Base, unsigned Depth) {
assert(Base.isCanonical() && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3) return;
if (Depth >= 3)
return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *BaseReg = Base.BaseRegs[i];
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
SmallVector<const SCEV *, 8> AddOps;
const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
if (Remainder)
AddOps.push_back(Remainder);
if (AddOps.size() == 1) continue;
for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
JE = AddOps.end(); J != JE; ++J) {
// Loop-variant "unknown" values are uninteresting; we won't be able to
// do anything meaningful with them.
if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
continue;
// Don't pull a constant into a register if the constant could be folded
// into an immediate field.
if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, *J, Base.getNumRegs() > 1))
continue;
// Collect all operands except *J.
SmallVector<const SCEV *, 8> InnerAddOps(
((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
InnerAddOps.append(std::next(J),
((const SmallVector<const SCEV *, 8> &)AddOps).end());
// Don't leave just a constant behind in a register if the constant could
// be folded into an immediate field.
if (InnerAddOps.size() == 1 &&
isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
continue;
const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
if (InnerSum->isZero())
continue;
Formula F = Base;
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
if (InnerSumSC &&
SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue())) {
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue();
F.BaseRegs.erase(F.BaseRegs.begin() + i);
} else
F.BaseRegs[i] = InnerSum;
// Add J as its own register, or an unfolded immediate.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue()))
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue();
else
F.BaseRegs.push_back(*J);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
}
}
if (Base.Scale == 1)
GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
/* Idx */ -1, /* IsScaledReg */ true);
}
/// GenerateCombinations - Generate a formula consisting of all of the
@ -3234,8 +3364,12 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// This method is only interesting on a plurality of registers.
if (Base.BaseRegs.size() <= 1) return;
if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
return;
// Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
// processing the formula.
Base.Unscale();
Formula F = Base;
F.BaseRegs.clear();
SmallVector<const SCEV *, 4> Ops;
@ -3255,29 +3389,87 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
F.Canonicalize();
(void)InsertFormula(LU, LUIdx, F);
}
}
}
/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, size_t Idx,
bool IsScaledReg) {
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
GlobalValue *GV = ExtractSymbol(G, SE);
if (G->isZero() || !GV)
return;
Formula F = Base;
F.BaseGV = GV;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
return;
if (IsScaledReg)
F.ScaledReg = G;
else
F.BaseRegs[Idx] = G;
(void)InsertFormula(LU, LUIdx, F);
}
/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// We can't add a symbolic offset if the address already contains one.
if (Base.BaseGV) return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
GlobalValue *GV = ExtractSymbol(G, SE);
if (G->isZero() || !GV)
continue;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
if (Base.Scale == 1)
GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
/* IsScaledReg */ true);
}
/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
E = Worklist.end();
I != E; ++I) {
Formula F = Base;
F.BaseGV = GV;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
LU.AccessTy, F)) {
// Add the offset to the base register.
const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
// If it cancelled out, drop the base register, otherwise update it.
if (NewG->isZero()) {
if (IsScaledReg) {
F.Scale = 0;
F.ScaledReg = nullptr;
} else
F.DeleteBaseReg(F.BaseRegs[Idx]);
F.Canonicalize();
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
F.BaseRegs[Idx] = NewG;
(void)InsertFormula(LU, LUIdx, F);
}
}
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
return;
Formula F = Base;
F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
return;
if (IsScaledReg)
F.ScaledReg = G;
else
F.BaseRegs[Idx] = G;
(void)InsertFormula(LU, LUIdx, F);
}
/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
@ -3290,38 +3482,11 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
if (LU.MaxOffset != LU.MinOffset)
Worklist.push_back(LU.MaxOffset);
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
E = Worklist.end(); I != E; ++I) {
Formula F = Base;
F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
LU.AccessTy, F)) {
// Add the offset to the base register.
const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
// If it cancelled out, drop the base register, otherwise update it.
if (NewG->isZero()) {
std::swap(F.BaseRegs[i], F.BaseRegs.back());
F.BaseRegs.pop_back();
} else
F.BaseRegs[i] = NewG;
(void)InsertFormula(LU, LUIdx, F);
}
}
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
continue;
Formula F = Base;
F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
}
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
if (Base.Scale == 1)
GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
/* IsScaledReg */ true);
}
/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
@ -3421,7 +3586,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (!IntTy) return;
// If this Formula already has a scaled register, we can't add another one.
if (Base.Scale != 0) return;
// Try to unscale the formula to generate a better scale.
if (Base.Scale != 0 && !Base.Unscale())
return;
assert(Base.Scale == 0 && "Unscale did not do its job!");
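// E.g., a canonical Base of reg1 + 1*reg2 is flattened back to reg1 + reg2
// here so that either register may receive the new scaling factor tried
// below.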
// Check each interesting stride.
for (SmallSetVector<int64_t, 8>::const_iterator
@ -3462,6 +3631,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
Formula F = Base;
F.ScaledReg = Quotient;
F.DeleteBaseReg(F.BaseRegs[i]);
// The canonical representation of 1*reg is reg, which is already in
// Base. In that case, do not try to insert the formula, it will be
// rejected anyway.
if (F.Scale == 1 && F.BaseRegs.empty())
continue;
(void)InsertFormula(LU, LUIdx, F);
}
}
@ -3626,7 +3800,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// TODO: Use a more targeted data structure.
for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
const Formula &F = LU.Formulae[L];
Formula F = LU.Formulae[L];
// FIXME: The code for the scaled and unscaled registers looks
// very similar but slightly different. Investigate if they
// could be merged. That way, we would not have to unscale the
// Formula.
F.Unscale();
// Use the immediate in the scaled register.
if (F.ScaledReg == OrigReg) {
int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
@ -3652,6 +3831,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
NewF.Canonicalize();
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@ -3685,6 +3865,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
NewF.Canonicalize();
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
@ -3938,7 +4119,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
E = LU.Formulae.end(); I != E; ++I) {
const Formula &F = *I;
if (F.BaseOffset == 0 || F.Scale != 0)
if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
continue;
LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
@ -4399,25 +4580,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
Loops, SE, DT);
if (LU.Kind == LSRUse::ICmpZero) {
// An interesting way of "folding" with an icmp is to use a negated
// scale, which we'll implement by inserting it into the other operand
// of the icmp.
assert(F.Scale == -1 &&
"The only scale supported by ICmpZero uses is -1!");
ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP);
// Expand ScaledReg as if it were part of the base regs.
if (F.Scale == 1)
Ops.push_back(
SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)));
else {
// An interesting way of "folding" with an icmp is to use a negated
// scale, which we'll implement by inserting it into the other operand
// of the icmp.
assert(F.Scale == -1 &&
"The only scale supported by ICmpZero uses is -1!");
ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP);
}
} else {
// Otherwise just expand the scaled register and an explicit scale,
// which is expected to be matched as part of the address.
// Flush the operand list to suppress SCEVExpander hoisting address modes.
if (!Ops.empty() && LU.Kind == LSRUse::Address) {
// Unless the addressing mode will not be folded.
if (!Ops.empty() && LU.Kind == LSRUse::Address &&
isAMCompletelyFolded(TTI, LU, F)) {
Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
}
ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP));
ScaledS = SE.getMulExpr(ScaledS,
SE.getConstant(ScaledS->getType(), F.Scale));
if (F.Scale != 1)
ScaledS =
SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
Ops.push_back(ScaledS);
}
}
@ -4495,7 +4685,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
CI->setOperand(1, ICmpScaledV);
} else {
assert(F.Scale == 0 &&
// A scale of 1 means that the scale has been expanded as part of the
// base regs.
assert((F.Scale == 0 || F.Scale == 1) &&
"ICmp does not support folding a global value and "
"a scale at the same time!");
Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),


@ -1,6 +1,9 @@
; RUN: opt -S -loop-reduce < %s | FileCheck %s
; Complex addressing modes are costly.
; Make loop-reduce prefer unscaled accesses.
; On X86, reg1 + 1*reg2 has the same cost as reg1 + 8*reg2.
; Therefore, LSR currently prefers to fold as much computation as possible
; in the addressing mode.
; <rdar://problem/16730541>
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"
@ -18,8 +21,8 @@ for.body: ; preds = %for.body, %entry
%tmp = add nsw i64 %indvars.iv, -1
%arrayidx = getelementptr inbounds double* %b, i64 %tmp
%tmp1 = load double* %arrayidx, align 8
; The induction variable should carry the scaling factor: 1 * 8 = 8.
; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
; The induction variable should carry the scaling factor: 1.
; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
%indvars.iv.next = add i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
%tmp2 = load double* %arrayidx2, align 8
@ -27,8 +30,8 @@ for.body: ; preds = %for.body, %entry
%arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
store double %mul, double* %arrayidx4, align 8
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
; Comparison should be 19 * 8 = 152.
; CHECK: icmp eq i32 {{%[^,]+}}, 152
; Comparison should be 19 * 1 = 19.
; CHECK: icmp eq i32 {{%[^,]+}}, 19
%exitcond = icmp eq i32 %lftr.wideiv, 20
br i1 %exitcond, label %for.end, label %for.body


@ -5,7 +5,7 @@
; CHECK-LABEL: count_up
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: addq $8,
; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up(double* %d, i64 %n) nounwind {
@ -71,7 +71,7 @@ return:
; CHECK-LABEL: count_up_signed
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: addq $8,
; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up_signed(double* %d, i64 %n) nounwind {
@ -242,7 +242,7 @@ return:
; CHECK-LABEL: another_count_down_signed
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: addq $-8,
; CHECK: decq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_down_signed(double* %d, i64 %n) nounwind {