diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 349d04524fe..7867d9fad34 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -634,6 +634,19 @@ static Type *getAccessType(const Instruction *Inst) { return AccessTy; } +/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { + for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); + PHINode *PN = dyn_cast(I); ++I) { + if (SE.isSCEVable(PN->getType()) && + (SE.getEffectiveSCEVType(PN->getType()) == + SE.getEffectiveSCEVType(AR->getType())) && + SE.getSCEV(PN) == AR) + return true; + } + return false; +} + /// DeleteTriviallyDeadInstructions - If any of the instructions is the /// specified set are trivially dead, delete them and see if this makes any of /// their operands subsequently dead. @@ -703,7 +716,8 @@ public: const DenseSet &VisitedRegs, const Loop *L, const SmallVectorImpl &Offsets, - ScalarEvolution &SE, DominatorTree &DT); + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs = 0); void print(raw_ostream &OS) const; void dump() const; @@ -716,7 +730,8 @@ private: void RatePrimaryRegister(const SCEV *Reg, SmallPtrSet &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs); }; } @@ -736,18 +751,13 @@ void Cost::RateRegister(const SCEV *Reg, // on other loops, and cannot be expected to change sibling loops. If the // AddRec exists, consider it's register free and leave it alone. Otherwise, // do not consider this formula at all. - // FIXME: why do we need to generate such fomulae? else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { - for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast(I); ++I) { - if (SE.isSCEVable(PN->getType()) && - (SE.getEffectiveSCEVType(PN->getType()) == - SE.getEffectiveSCEVType(AR->getType())) && - SE.getSCEV(PN) == AR) - return; - } + if (isExistingPhi(AR, SE)) + return; + + // For !EnableNested, never rewrite IVs in other loops. if (!EnableNested) { Loose(); return; @@ -789,13 +799,22 @@ void Cost::RateRegister(const SCEV *Reg, } /// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. +/// before, rate it. Optional LoserRegs provides a way to declare any formula +/// that refers to one of those regs an instant loser. void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSet &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { - if (Regs.insert(Reg)) + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs) { + if (LoserRegs && LoserRegs->count(Reg)) { + Loose(); + return; + } + if (Regs.insert(Reg)) { RateRegister(Reg, Regs, L, SE, DT); + if (isLoser()) + LoserRegs->insert(Reg); + } } void Cost::RateFormula(const Formula &F, @@ -803,14 +822,15 @@ void Cost::RateFormula(const Formula &F, const DenseSet &VisitedRegs, const Loop *L, const SmallVectorImpl &Offsets, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { Loose(); return; } - RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); if (isLoser()) return; } @@ -821,7 +841,7 @@ void Cost::RateFormula(const Formula &F, Loose(); return; } - RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); if (isLoser()) return; } @@ -1103,7 +1123,6 @@ bool LSRUse::InsertFormula(const Formula &F) { Formulae.push_back(F); // Record registers now being used by this use. - if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); return true; @@ -1114,7 +1133,6 @@ void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); - assert(!Formulae.empty() && "LSRUse has no formulae left!"); } /// RecomputeRegs - Recompute the Regs field, and update RegUses. @@ -2912,6 +2930,7 @@ LSRInstance::GenerateAllReuseFormulae() { void LSRInstance::FilterOutUndesirableDedicatedRegisters() { DenseSet VisitedRegs; SmallPtrSet Regs; + SmallPtrSet LoserRegs; #ifndef NDEBUG bool ChangedFormulae = false; #endif @@ -2931,46 +2950,66 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { FIdx != NumForms; ++FIdx) { Formula &F = LU.Formulae[FIdx]; - SmallVector Key; - for (SmallVectorImpl::const_iterator J = F.BaseRegs.begin(), - JE = F.BaseRegs.end(); J != JE; ++J) { - const SCEV *Reg = *J; - if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) - Key.push_back(Reg); + // Some formulas are instant losers. For example, they may depend on + // nonexistent AddRecs from other loops. These need to be filtered + // immediately, otherwise heuristics could choose them over others leading + // to an unsatisfactory solution. Passing LoserRegs into RateFormula here + // avoids the need to recompute this information across formulae using the + // same bad AddRec. Passing LoserRegs is also essential unless we remove + // the corresponding bad register from the Regs set. + Cost CostF; + Regs.clear(); + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + &LoserRegs); + if (CostF.isLoser()) { + // During initial formula generation, undesirable formulae are generated + // by uses within other loops that have some non-trivial address mode or + // use the postinc form of the IV. LSR needs to provide these formulae + // as the basis of rediscovering the desired formula that uses an AddRec + // corresponding to the existing phi. Once all formulae have been + // generated, these initial losers may be pruned. + DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); + dbgs() << "\n"); } - if (F.ScaledReg && - RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) - Key.push_back(F.ScaledReg); - // Unstable sort by host order ok, because this is only used for - // uniquifying. - std::sort(Key.begin(), Key.end()); + else { + SmallVector Key; + for (SmallVectorImpl::const_iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) + Key.push_back(Reg); + } + if (F.ScaledReg && + RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) + Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for + // uniquifying. + std::sort(Key.begin(), Key.end()); + + std::pair P = + BestFormulae.insert(std::make_pair(Key, FIdx)); + if (P.second) + continue; - std::pair P = - BestFormulae.insert(std::make_pair(Key, FIdx)); - if (!P.second) { Formula &Best = LU.Formulae[P.first->second]; - Cost CostF; - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT); - Regs.clear(); Cost CostBest; - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); Regs.clear(); + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); dbgs() << "\n" " in favor of formula "; Best.print(dbgs()); dbgs() << '\n'); -#ifndef NDEBUG - ChangedFormulae = true; -#endif - LU.DeleteFormula(F); - --FIdx; - --NumForms; - Any = true; - continue; } +#ifndef NDEBUG + ChangedFormulae = true; +#endif + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; } // Now that we've filtered out some formulae, recompute the Regs set. diff --git a/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll b/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll new file mode 100644 index 00000000000..51086509627 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll @@ -0,0 +1,96 @@ +; RUN: llc < %s | FileCheck %s +; +; Test LSR's ability to prune formulae that refer to nonexistant +; AddRecs in other loops. +; +; Unable to reduce this case further because it requires LSR to exceed +; ComplexityLimit. +; +; We really just want to ensure that LSR can process this loop without +; finding an unsatisfactory solution and bailing out. I've added +; dummyout, an obvious candidate for postinc replacement so we can +; verify that LSR removes it. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin" + +; CHECK: @test +; CHECK: # %for.body{{$}} +; dummyiv copy should be removed +; CHECK-NOT: movq +; CHECK: # %for.cond19.preheader +; dummycnt should be removed +; CHECK-NOT: incq +; CHECK: # %for.body23{{$}} +define i64 @test(i64 %count, float* nocapture %srcrow, i32* nocapture %destrow) nounwind uwtable ssp { +entry: + %cmp34 = icmp eq i64 %count, 0 + br i1 %cmp34, label %for.end29, label %for.body + +for.body: ; preds = %entry, %for.body + %dummyiv = phi i64 [ %dummycnt, %for.body ], [ 0, %entry ] + %indvars.iv39 = phi i64 [ %indvars.iv.next40, %for.body ], [ 0, %entry ] + %dp.036 = phi i32* [ %add.ptr, %for.body ], [ %destrow, %entry ] + %p.035 = phi float* [ %incdec.ptr4, %for.body ], [ %srcrow, %entry ] + %incdec.ptr = getelementptr inbounds float* %p.035, i64 1 + %0 = load float* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds float* %p.035, i64 2 + %1 = load float* %incdec.ptr2, align 4 + %incdec.ptr3 = getelementptr inbounds float* %p.035, i64 3 + %2 = load float* %incdec.ptr3, align 4 + %incdec.ptr4 = getelementptr inbounds float* %p.035, i64 4 + %3 = load float* %incdec.ptr4, align 4 + %4 = load i32* %dp.036, align 4 + %conv5 = fptoui float %0 to i32 + %or = or i32 %4, %conv5 + %arrayidx6 = getelementptr inbounds i32* %dp.036, i64 1 + %5 = load i32* %arrayidx6, align 4 + %conv7 = fptoui float %1 to i32 + %or8 = or i32 %5, %conv7 + %arrayidx9 = getelementptr inbounds i32* %dp.036, i64 2 + %6 = load i32* %arrayidx9, align 4 + %conv10 = fptoui float %2 to i32 + %or11 = or i32 %6, %conv10 + %arrayidx12 = getelementptr inbounds i32* %dp.036, i64 3 + %7 = load i32* %arrayidx12, align 4 + %conv13 = fptoui float %3 to i32 + %or14 = or i32 %7, %conv13 + store i32 %or, i32* %dp.036, align 4 + store i32 %or8, i32* %arrayidx6, align 4 + store i32 %or11, i32* %arrayidx9, align 4 + store i32 %or14, i32* %arrayidx12, align 4 + %add.ptr = getelementptr inbounds i32* %dp.036, i64 4 + %indvars.iv.next40 = add i64 %indvars.iv39, 4 + %dummycnt = add i64 %dummyiv, 1 + %cmp = icmp ult i64 %indvars.iv.next40, %count + br i1 %cmp, label %for.body, label %for.cond19.preheader + +for.cond19.preheader: ; preds = %for.body + %dummyout = add i64 %dummyiv, 1 + %rem = and i64 %count, 3 + %cmp2130 = icmp eq i64 %rem, 0 + br i1 %cmp2130, label %for.end29, label %for.body23.lr.ph + +for.body23.lr.ph: ; preds = %for.cond19.preheader + %8 = and i64 %count, 3 + br label %for.body23 + +for.body23: ; preds = %for.body23, %for.body23.lr.ph + %indvars.iv = phi i64 [ 0, %for.body23.lr.ph ], [ %indvars.iv.next, %for.body23 ] + %dp.132 = phi i32* [ %add.ptr, %for.body23.lr.ph ], [ %incdec.ptr28, %for.body23 ] + %p.131 = phi float* [ %incdec.ptr4, %for.body23.lr.ph ], [ %incdec.ptr24, %for.body23 ] + %incdec.ptr24 = getelementptr inbounds float* %p.131, i64 1 + %9 = load float* %incdec.ptr24, align 4 + %10 = load i32* %dp.132, align 4 + %conv25 = fptoui float %9 to i32 + %or26 = or i32 %10, %conv25 + store i32 %or26, i32* %dp.132, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %incdec.ptr28 = getelementptr inbounds i32* %dp.132, i64 1 + %exitcond = icmp eq i64 %indvars.iv.next, %8 + br i1 %exitcond, label %for.end29, label %for.body23 + +for.end29: ; preds = %entry, %for.body23, %for.cond19.preheader + %result = phi i64 [ 0, %entry ], [ %dummyout, %for.body23 ], [ %dummyout, %for.cond19.preheader ] + ret i64 %result +}