diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 01a4b958cb6..68414242b8f 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -60,6 +60,9 @@ namespace llvm {
     /// insert the IV increment at this position.
     Instruction *IVIncInsertPos;
 
+    /// Phis that complete an IV chain. Reuse these instead of expanding the IV again.
+    std::set<AssertingVH<PHINode> > ChainedPhis;
+
     /// CanonicalMode - When true, expressions are expanded in "canonical"
     /// form. In particular, addrecs are expanded as arithmetic based on
     /// a canonical induction variable. When false, expression are expanded
@@ -102,6 +105,7 @@ namespace llvm {
       InsertedExpressions.clear();
       InsertedValues.clear();
       InsertedPostIncValues.clear();
+      ChainedPhis.clear();
     }
 
     /// getOrInsertCanonicalInductionVariable - This method returns the
@@ -164,6 +168,9 @@ namespace llvm {
     void clearInsertPoint() {
       Builder.ClearInsertionPoint();
     }
+
+    void setChainedPhi(PHINode *PN) { ChainedPhis.insert(PN); }
+
   private:
     LLVMContext &getContext() const { return SE.getContext(); }
 
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 26ae22b56b6..8dc8eb68ff4 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -874,6 +874,9 @@ bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
 /// expandAddtoGEP.
 bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
                                            const Loop *L) {
+  if (ChainedPhis.count(PN))
+    return true;
+
   switch (IncV->getOpcode()) {
   // Check for a simple Add/Sub or GEP of a loop invariant step.
   case Instruction::Add:
@@ -1638,8 +1641,8 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
         const SCEV *TruncExpr =
           SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
         if (OrigInc != IsomorphicInc
-            && TruncExpr == SE.getSCEV(IsomorphicInc) &&
-            hoistStep(OrigInc, IsomorphicInc, DT)) {
+            && TruncExpr == SE.getSCEV(IsomorphicInc)
+            && hoistStep(OrigInc, IsomorphicInc, DT)) {
           DEBUG_WITH_TYPE(DebugType, dbgs()
                           << "INDVARS: Eliminated congruent iv.inc: "
                           << *IsomorphicInc << '\n');
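The two ScalarEvolutionExpander hunks above are the consumer side of IV chaining: once LSR decides a chain ends at an existing header phi, the expander must accept that phi's increment as-is rather than re-expanding it into canonical form. A minimal usage sketch (illustrative only; it mirrors the call pattern added to LSRInstance::ImplementSolution further down, and assumes 'Chain' is an IVChain and 'Rewriter' is a SCEVExpander already switched into LSR mode):

  // Tell the expander which phi completes this chain so that
  // isExpandedAddRecExprPHI() short-circuits for it and reuses the phi.
  if (PHINode *PN = dyn_cast<PHINode>(Chain.back().UserInst))
    Rewriter.setChainedPhi(PN);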
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 48693c743af..85f1389fe31 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -658,6 +658,77 @@ static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
   return false;
 }
 
+/// Check if expanding this expression is likely to incur significant cost.
+/// This is tricky because SCEV doesn't track which expressions are actually
+/// computed by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+                                SmallPtrSet<const SCEV*, 8> &Processed,
+                                ScalarEvolution &SE) {
+  // Zero/One operand expressions
+  switch (S->getSCEVType()) {
+  case scUnknown:
+  case scConstant:
+    return false;
+  case scTruncate:
+    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scZeroExtend:
+    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scSignExtend:
+    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  }
+
+  if (!Processed.insert(S))
+    return false;
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I) {
+      if (isHighCostExpansion(*I, Processed, SE))
+        return true;
+    }
+    return false;
+  }
+
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    if (Mul->getNumOperands() == 2) {
+      // Multiplication by a constant is ok.
+      if (isa<SCEVConstant>(Mul->getOperand(0)))
+        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+      // If we have the value of one operand, check if an existing
+      // multiplication already generates this expression.
+      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+        Value *UVal = U->getValue();
+        for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
+             UI != UE; ++UI) {
+          Instruction *User = cast<Instruction>(*UI);
+          if (User->getOpcode() == Instruction::Mul
+              && SE.isSCEVable(User->getType())) {
+            return SE.getSCEV(User) == Mul;
+          }
+        }
+      }
+    }
+  }
+
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (isExistingPhi(AR, SE))
+      return false;
+  }
+
+  // For now, consider any other type of expression (div/mul/min/max) high cost.
+  return true;
+}
+
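As a rough illustration of what the helper above accepts and rejects (the SCEVs below are invented examples, not drawn from a particular test case), together with the call pattern used later in getProfitableChainIncrement:

  //   (4 * %x)           -> cheap: multiply by a constant
  //   ((4 * %x) + 16)    -> cheap: add of cheap operands
  //   {%base,+,%step}    -> cheap only if an existing phi already computes it
  //   (%x * %y)          -> high cost unless a 'mul %x, %y' instruction already
  //                         exists in the IR and is SCEVable
  //   (%n /u 4)          -> high cost: udiv is never expanded (see TODO above)
  SmallPtrSet<const SCEV*, 8> Processed;
  if (isHighCostExpansion(IncExpr, Processed, SE))
    return 0;  // give up on chaining this increment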
 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 /// specified set are trivially dead, delete them and see if this makes any of
 /// their operands subsequently dead.
@@ -2204,6 +2275,49 @@ static bool isCompatibleIVType(Value *LVal, Value *RVal) {
   return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
 }
 
+/// getExprBase - Return an approximation of this SCEV expression's "base", or
+/// NULL for any constant. Returning the expression itself is
+/// conservative. Returning a deeper subexpression is more precise and valid as
+/// long as it isn't less complex than another subexpression. For expressions
+/// involving multiple unscaled values, we need to return the pointer-type
+/// SCEVUnknown. This avoids forming chains across objects, such as:
+/// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknowns, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+  switch (S->getSCEVType()) {
+  default: // including scUnknown.
+    return S;
+  case scConstant:
+    return 0;
+  case scTruncate:
+    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+  case scZeroExtend:
+    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+  case scSignExtend:
+    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+  case scAddExpr: {
+    // Skip over scaled operands (scMulExpr) to follow add operands as long as
+    // there's nothing more complex.
+    // FIXME: not sure if we want to recognize negation.
+    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+           E(Add->op_begin()); I != E; ++I) {
+      const SCEV *SubExpr = *I;
+      if (SubExpr->getSCEVType() == scAddExpr)
+        return getExprBase(SubExpr);
+
+      if (SubExpr->getSCEVType() != scMulExpr)
+        return SubExpr;
+    }
+    return S; // all operands are scaled, be conservative.
+  }
+  case scAddRecExpr:
+    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+  }
+}
+
 /// Return true if the chain increment is profitable to expand into a loop
 /// invariant value, which may require its own register. A profitable chain
 /// increment will be an offset relative to the same base. We allow such offsets
@@ -2213,7 +2327,16 @@ static const SCEV *
 getProfitableChainIncrement(Value *NextIV, Value *PrevIV,
                             const IVChain &Chain, Loop *L,
                             ScalarEvolution &SE, const TargetLowering *TLI) {
-  const SCEV *IncExpr = SE.getMinusSCEV(SE.getSCEV(NextIV), SE.getSCEV(PrevIV));
+  // Prune the solution space aggressively by checking that both IV operands
+  // are expressions that operate on the same unscaled SCEVUnknown. This
+  // "base" will be canceled by the subsequent getMinusSCEV call. Checking first
+  // avoids creating extra SCEV expressions.
+  const SCEV *OperExpr = SE.getSCEV(NextIV);
+  const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+  if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain)
+    return 0;
+
+  const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
   if (!SE.isLoopInvariant(IncExpr, L))
     return 0;
 
@@ -2222,8 +2345,19 @@ getProfitableChainIncrement(Value *NextIV, Value *PrevIV,
   if (StressIVChain)
     return IncExpr;
 
-  // Unimplemented
-  return 0;
+  // Do not replace a constant offset from IV head with a nonconstant IV
+  // increment.
+  if (!isa<SCEVConstant>(IncExpr)) {
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand));
+    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+      return 0;
+  }
+
+  SmallPtrSet<const SCEV*, 8> Processed;
+  if (isHighCostExpansion(IncExpr, Processed, SE))
+    return 0;
+
+  return IncExpr;
 }
 
 /// Return true if the number of registers needed for the chain is estimated to
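To make the base pruning above concrete (hedged, invented examples): getExprBase looks through extensions and AddRec starts, skips scaled (multiplied-by-stride) operands of an add, and returns the rightmost unscaled operand, which for address expressions is the pointer-typed SCEVUnknown:

  //   getExprBase(((4 * %x) + %a))               == %a     (unscaled SCEVUnknown)
  //   getExprBase({(%a + 16),+,(4 * %x)}<%loop>) == %a     (recurses into the start)
  //   getExprBase((sext i32 %i to i64))          == %i
  //   getExprBase(42)                            == null   (constants have no base)
  //
  // With PrevIV == &a[i] and NextIV == &b[i], the bases %a and %b differ, so
  // getProfitableChainIncrement() bails out before it would ever form the
  // cross-object increment (&b[i] - &a[i]), i.e. the IVInc==b-a case named in
  // the getExprBase comment.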
@@ -2242,8 +2376,72 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   if (StressIVChain)
     return true;
 
-  // Unimplemented
-  return false;
+  if (Chain.size() <= 2)
+    return false;
+
+  if (!Users.empty()) {
+    DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n";
+          for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
+                 E = Users.end(); I != E; ++I) {
+            dbgs() << "  " << **I << "\n";
+          });
+    return false;
+  }
+  assert(!Chain.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so initialize cost to 1.
+  int cost = 1;
+
+  // A complete chain likely eliminates the need for keeping the original IV in
+  // a register. LSR does not currently know how to form a complete chain unless
+  // the header phi already exists.
+  if (isa<PHINode>(Chain.back().UserInst)
+      && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) {
+    --cost;
+  }
+  const SCEV *LastIncExpr = 0;
+  unsigned NumConstIncrements = 0;
+  unsigned NumVarIncrements = 0;
+  unsigned NumReusedIncrements = 0;
+  for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+       I != E; ++I) {
+
+    if (I->IncExpr->isZero())
+      continue;
+
+    // Incrementing by zero or some constant is neutral. We assume constants can
+    // be folded into an addressing mode or an add's immediate operand.
+    if (isa<SCEVConstant>(I->IncExpr)) {
+      ++NumConstIncrements;
+      continue;
+    }
+
+    if (I->IncExpr == LastIncExpr)
+      ++NumReusedIncrements;
+    else
+      ++NumVarIncrements;
+
+    LastIncExpr = I->IncExpr;
+  }
+  // An IV chain with a single increment is handled by LSR's postinc
+  // uses. However, a chain with multiple increments requires keeping the IV's
+  // value live longer than it needs to be if chained.
+  if (NumConstIncrements > 1)
+    --cost;
+
+  // Materializing increment expressions in the preheader that didn't exist in
+  // the original code may cost a register. For example, sign-extended array
+  // indices can produce ridiculous increments like this:
+  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+  cost += NumVarIncrements;
+
+  // Reusing variable increments likely saves a register to hold the multiple of
+  // the stride.
+  cost -= NumReusedIncrements;
+
+  DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n");
+
+  return cost < 0;
 }
 
 /// ChainInstruction - Add this IV user to an existing chain or make it the head
@@ -4280,6 +4478,13 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
   Rewriter.enableLSRMode();
   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
 
+  // Mark phi nodes that terminate chains so the expander tries to reuse them.
+  for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
+         ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
+    if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst))
+      Rewriter.setChainedPhi(PN);
+  }
+
   // Expand the new value definitions and update the users.
   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
        E = Fixups.end(); I != E; ++I) {
diff --git a/test/Transforms/LoopStrengthReduce/ARM/dg.exp b/test/Transforms/LoopStrengthReduce/ARM/dg.exp
new file mode 100644
index 00000000000..a0009b86e5b
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/ARM/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target ARM] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
new file mode 100644
index 00000000000..9189d79e2fb
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -0,0 +1,292 @@
+; RUN: llc < %s -O3 -march=thumb -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
+
+; @simple is the most basic chain of address induction variables. Chaining
+; saves at least one register and avoids complex addressing and setup
+; code.
+;
+; A9: @simple
+; no expensive address computation in the preheader
+; A9: lsl
+; A9-NOT: lsl
+; A9: %loop
+; no complex address modes
+; A9-NOT: lsl
+define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ]
+  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
+  %v = load i32* %iv
+  %iv1 = getelementptr inbounds i32* %iv, i32 %x
+  %v1 = load i32* %iv1
+  %iv2 = getelementptr inbounds i32* %iv1, i32 %x
+  %v2 = load i32* %iv2
+  %iv3 = getelementptr inbounds i32* %iv2, i32 %x
+  %v3 = load i32* %iv3
+  %s1 = add i32 %s, %v
+  %s2 = add i32 %s1, %v1
+  %s3 = add i32 %s2, %v2
+  %s4 = add i32 %s3, %v3
+  %iv4 = getelementptr inbounds i32* %iv3, i32 %x
+  %cmp = icmp eq i32* %iv4, %b
+  br i1 %cmp, label %exit, label %loop
+exit:
+  ret i32 %s4
+}
+
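For a loop shaped like @simple above, the isProfitableChain heuristic from the LoopStrengthReduce.cpp hunk sees one chain of address users that all step by the same loop-invariant amount. A rough, illustrative tally (assuming the chain is completed by the existing pointer phi and contains four identical non-constant increments; the exact chain contents depend on how ChainInstruction groups the users):

  //   cost  = 1   // the chain head may need its own register
  //   cost -= 1   // the chain ends at an existing header phi (complete chain)
  //   cost += 1   // first non-constant increment (NumVarIncrements)
  //   cost -= 3   // three further identical increments (NumReusedIncrements)
  //   => cost == -2 < 0, so the chain is kept: the (4 * %x) stride multiple is
  //      materialized once in the preheader and each user becomes a simple
  //      pointer bump instead of a separate scaled address computation.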
+; +; A9: @user +; stride multiples computed in the preheader +; A9: lsl +; A9: lsl +; A9: %loop +; complex address modes +; A9: lsl +; A9: lsl +define i32 @user(i32* %a, i32* %b, i32 %x) nounwind { +entry: + br label %loop +loop: + %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ] + %s = phi i32 [ 0, %entry ], [ %s4, %loop ] + %v = load i32* %iv + %iv1 = getelementptr inbounds i32* %iv, i32 %x + %v1 = load i32* %iv1 + %iv2 = getelementptr inbounds i32* %iv1, i32 %x + %v2 = load i32* %iv2 + %iv3 = getelementptr inbounds i32* %iv2, i32 %x + %v3 = load i32* %iv3 + %s1 = add i32 %s, %v + %s2 = add i32 %s1, %v1 + %s3 = add i32 %s2, %v2 + %s4 = add i32 %s3, %v3 + %iv4 = getelementptr inbounds i32* %iv3, i32 %x + store i32 %s4, i32* %iv + %cmp = icmp eq i32* %iv4, %b + br i1 %cmp, label %exit, label %loop +exit: + ret i32 %s4 +} + +; @extrastride is a slightly more interesting case of a single +; complete chain with multiple strides. The test case IR is what LSR +; used to do, and exactly what we don't want to do. LSR's new IV +; chaining feature should now undo the damage. +; +; A9: extrastride: +; no spills +; A9-NOT: str +; only one stride multiple in the preheader +; A9: lsl +; A9-NOT: {{str r|lsl}} +; A9: %for.body{{$}} +; no complex address modes or reloads +; A9-NOT: {{ldr .*[sp]|lsl}} +define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind { +entry: + %cmp8 = icmp eq i32 %z, 0 + br i1 %cmp8, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 ; s*2 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3 + %add.ptr2.sum = add i32 %x, %main_stride ; s + x + %add.ptr4.sum = shl i32 %main_stride, 2 ; s*4 + %add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %main.addr.011 = phi i8* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %res.addr.09 = phi i32* [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ] + %0 = bitcast i8* %main.addr.011 to i32* + %1 = load i32* %0, align 4 + %add.ptr = getelementptr inbounds i8* %main.addr.011, i32 %main_stride + %2 = bitcast i8* %add.ptr to i32* + %3 = load i32* %2, align 4 + %add.ptr1 = getelementptr inbounds i8* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8* %add.ptr1 to i32* + %5 = load i32* %4, align 4 + %add.ptr2 = getelementptr inbounds i8* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8* %add.ptr2 to i32* + %7 = load i32* %6, align 4 + %add.ptr3 = getelementptr inbounds i8* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8* %add.ptr3 to i32* + %9 = load i32* %8, align 4 + %add = add i32 %3, %1 + %add4 = add i32 %add, %5 + %add5 = add i32 %add4, %7 + %add6 = add i32 %add5, %9 + store i32 %add6, i32* %res.addr.09, align 4 + %add.ptr6 = getelementptr inbounds i8* %main.addr.011, i32 %add.ptr3.sum + %add.ptr7 = getelementptr inbounds i32* %res.addr.09, i32 %y + %inc = add i32 %i.010, 1 + %cmp = icmp eq i32 %inc, %z + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; @foldedidx is an unrolled variant of this loop: +; for (unsigned long i = 0; i < len; i += s) { +; c[i] = a[i] + b[i]; +; } +; where 's' can be folded into the addressing mode. +; Consequently, we should *not* form any chains. 
+; +; A9: foldedidx: +; A9: ldrb.w {{r[0-9]|lr}}, [{{r[0-9]|lr}}, #3] +define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ] + %arrayidx = getelementptr inbounds i8* %a, i32 %i.07 + %0 = load i8* %arrayidx, align 1 + %conv5 = zext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds i8* %b, i32 %i.07 + %1 = load i8* %arrayidx1, align 1 + %conv26 = zext i8 %1 to i32 + %add = add nsw i32 %conv26, %conv5 + %conv3 = trunc i32 %add to i8 + %arrayidx4 = getelementptr inbounds i8* %c, i32 %i.07 + store i8 %conv3, i8* %arrayidx4, align 1 + %inc1 = or i32 %i.07, 1 + %arrayidx.1 = getelementptr inbounds i8* %a, i32 %inc1 + %2 = load i8* %arrayidx.1, align 1 + %conv5.1 = zext i8 %2 to i32 + %arrayidx1.1 = getelementptr inbounds i8* %b, i32 %inc1 + %3 = load i8* %arrayidx1.1, align 1 + %conv26.1 = zext i8 %3 to i32 + %add.1 = add nsw i32 %conv26.1, %conv5.1 + %conv3.1 = trunc i32 %add.1 to i8 + %arrayidx4.1 = getelementptr inbounds i8* %c, i32 %inc1 + store i8 %conv3.1, i8* %arrayidx4.1, align 1 + %inc.12 = or i32 %i.07, 2 + %arrayidx.2 = getelementptr inbounds i8* %a, i32 %inc.12 + %4 = load i8* %arrayidx.2, align 1 + %conv5.2 = zext i8 %4 to i32 + %arrayidx1.2 = getelementptr inbounds i8* %b, i32 %inc.12 + %5 = load i8* %arrayidx1.2, align 1 + %conv26.2 = zext i8 %5 to i32 + %add.2 = add nsw i32 %conv26.2, %conv5.2 + %conv3.2 = trunc i32 %add.2 to i8 + %arrayidx4.2 = getelementptr inbounds i8* %c, i32 %inc.12 + store i8 %conv3.2, i8* %arrayidx4.2, align 1 + %inc.23 = or i32 %i.07, 3 + %arrayidx.3 = getelementptr inbounds i8* %a, i32 %inc.23 + %6 = load i8* %arrayidx.3, align 1 + %conv5.3 = zext i8 %6 to i32 + %arrayidx1.3 = getelementptr inbounds i8* %b, i32 %inc.23 + %7 = load i8* %arrayidx1.3, align 1 + %conv26.3 = zext i8 %7 to i32 + %add.3 = add nsw i32 %conv26.3, %conv5.3 + %conv3.3 = trunc i32 %add.3 to i8 + %arrayidx4.3 = getelementptr inbounds i8* %c, i32 %inc.23 + store i8 %conv3.3, i8* %arrayidx4.3, align 1 + %inc.3 = add nsw i32 %i.07, 4 + %exitcond.3 = icmp eq i32 %inc.3, 400 + br i1 %exitcond.3, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; @testNeon is an important example of the nead for ivchains. +; +; Currently we have three extra add.w's that keep the store address +; live past the next increment because ISEL is unfortunately undoing +; the store chain. ISEL also fails to convert the stores to +; post-increment addressing. However, the loads should use +; post-increment addressing, no add's or add.w's beyond the three +; mentioned. Most importantly, there should be no spills or reloads! +; +; CHECK: testNeon: +; CHECK: %.lr.ph +; CHECK-NOT: lsl.w +; CHECK-NOT: {{ldr|str|adds|add r}} +; CHECK: add.w r +; CHECK-NOT: {{ldr|str|adds|add r}} +; CHECK: add.w r +; CHECK-NOT: {{ldr|str|adds|add r}} +; CHECK: add.w r +; CHECK-NOT: {{ldr|str|adds|add r}} +; CHECK-NOT: add.w r +; CHECK: bne +define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize { + %1 = icmp sgt i32 %limit, 0 + br i1 %1, label %.lr.ph, label %45 + +.lr.ph: ; preds = %0 + %2 = shl nsw i32 %ref_stride, 1 + %3 = mul nsw i32 %ref_stride, 3 + %4 = shl nsw i32 %ref_stride, 2 + %5 = mul nsw i32 %ref_stride, 5 + %6 = mul nsw i32 %ref_stride, 6 + %7 = mul nsw i32 %ref_stride, 7 + %8 = shl nsw i32 %ref_stride, 3 + %9 = sub i32 0, %8 + %10 = mul i32 %limit, -64 + br label %11 + +;