diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 412a954566b..44f28d3d770 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6463,6 +6463,10 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
     // We can only fold this if we don't need an extra load.
     if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
       return false;
+    // If BaseGV requires a register, we cannot also have a BaseReg.
+    if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
+        AM.HasBaseReg)
+      return false;
 
     // X86-64 only supports addr of globals in small code model.
     if (Subtarget->is64Bit()) {
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b836471afc1..583fe8a0fbe 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -59,7 +59,23 @@ bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
       return (GV->hasDLLImportLinkage());
     }
   }
-
+  return false;
+}
+
+/// True if accessing the GV requires a register.  This is a superset of the
+/// cases where GVRequiresExtraLoad is true.  Some variations of PIC require
+/// a register, but not an extra load.
+bool X86Subtarget::GVRequiresRegister(const GlobalValue *GV,
+                                      const TargetMachine& TM,
+                                      bool isDirectCall) const
+{
+  if (GVRequiresExtraLoad(GV, TM, isDirectCall))
+    return true;
+  // Code below here need only consider cases where GVRequiresExtraLoad
+  // returns false.
+  if (TM.getRelocationModel() == Reloc::PIC_)
+    return !isDirectCall &&
+           (GV->hasInternalLinkage() || GV->hasExternalLinkage());
   return false;
 }
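Taken together, the two X86 changes teach isLegalAddressingMode that a global which itself needs a register (as under PIC, where globals are addressed relative to the picbase register) cannot share an addressing mode with a separate base register. A self-contained toy model of the new rule, using stand-in types rather than the real TargetLowering API:

    // Toy model only: "AddrMode" stands in for TargetLowering::AddrMode and
    // the GVRequiresRegister flag stands in for the new subtarget query.
    #include <cassert>

    struct AddrMode { bool HasBaseGV; bool HasBaseReg; };

    bool isLegalX86AddrMode(const AddrMode &AM, bool GVRequiresRegister) {
      // The new check: if the global already consumes a register (e.g. the
      // picbase under -relocation-model=pic), a second base register would
      // push the mode past what one x86 address can encode.
      if (AM.HasBaseGV && GVRequiresRegister && AM.HasBaseReg)
        return false;
      return true;  // all other combinations are unaffected by this patch
    }

    int main() {
      AddrMode GVOnly    = { true, false };
      AddrMode GVPlusReg = { true, true };
      assert( isLegalX86AddrMode(GVOnly,    true));   // PIC: still legal
      assert(!isLegalX86AddrMode(GVPlusReg, true));   // PIC: now rejected
      assert( isLegalX86AddrMode(GVPlusReg, false));  // static: unchanged
      return 0;
    }

LoopStrengthReduce, below, is the client that cares: it passes HasBaseReg so the target can veto folding a global into an address that already carries the strided induction register.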
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 38fd7fad65c..9b792c1aca9 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -444,7 +444,33 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
   return true;
 }
-
+/// isAddressUse - Returns true if the specified instruction is using the
+/// specified value as an address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+  bool isAddress = isa<LoadInst>(Inst);
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    if (SI->getOperand(1) == OperandVal)
+      isAddress = true;
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::prefetch:
+      case Intrinsic::x86_sse2_loadu_dq:
+      case Intrinsic::x86_sse2_loadu_pd:
+      case Intrinsic::x86_sse_loadu_ps:
+      case Intrinsic::x86_sse_storeu_ps:
+      case Intrinsic::x86_sse2_storeu_pd:
+      case Intrinsic::x86_sse2_storeu_dq:
+      case Intrinsic::x86_sse2_storel_dq:
+        if (II->getOperand(1) == OperandVal)
+          isAddress = true;
+        break;
+    }
+  }
+  return isAddress;
+}
 
 /// AddUsersIfInteresting - Inspect the specified instruction.  If it is a
 /// reducible SCEV, recursively add its users to the IVUsesByStride set and
@@ -731,15 +757,16 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
 }
 
-/// isTargetConstant - Return true if the following can be referenced by the
-/// immediate field of a target instruction.
-static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
-                             const TargetLowering *TLI) {
+/// fitsInAddressMode - Return true if V can be subsumed within an addressing
+/// mode, and does not need to be put in a register first.
+static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy,
+                              const TargetLowering *TLI, bool HasBaseReg) {
   if (SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
     int64_t VC = SC->getValue()->getSExtValue();
     if (TLI) {
       TargetLowering::AddrMode AM;
       AM.BaseOffs = VC;
+      AM.HasBaseReg = HasBaseReg;
       return TLI->isLegalAddressingMode(AM, UseTy);
     } else {
       // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
@@ -754,6 +781,7 @@ static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
       if (GlobalValue *GV = dyn_cast<GlobalValue>(Op0)) {
         TargetLowering::AddrMode AM;
         AM.BaseGV = GV;
+        AM.HasBaseReg = HasBaseReg;
         return TLI->isLegalAddressingMode(AM, UseTy);
       }
     }
@@ -846,7 +874,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
     return;
   } else if (SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
     // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
-    if (isAddress && isTargetConstant(SME->getOperand(0), UseTy, TLI) &&
+    if (isAddress && fitsInAddressMode(SME->getOperand(0), UseTy, TLI, false) &&
         SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
       SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
@@ -859,7 +887,7 @@
       // Scale SubImm up by "8".  If the result is a target constant, we are
       // good.
       SubImm = SE->getMulExpr(SubImm, SME->getOperand(0));
-      if (isTargetConstant(SubImm, UseTy, TLI)) {
+      if (fitsInAddressMode(SubImm, UseTy, TLI, false)) {
         // Accumulate the immediate.
         Imm = SE->getAddExpr(Imm, SubImm);
@@ -873,7 +901,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
 
   // Loop-variant expressions must stay in the immediate field of the
   // expression.
-  if ((isAddress && isTargetConstant(Val, UseTy, TLI)) ||
+  if ((isAddress && fitsInAddressMode(Val, UseTy, TLI, false)) ||
       !Val->isLoopInvariant(L)) {
     Imm = SE->getAddExpr(Imm, Val);
     Val = SE->getIntegerSCEV(0, Val->getType());
@@ -912,21 +940,28 @@ static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
   }
 }
 
+// This is logically local to the following function, but C++ says we have
+// to make it file scope.
+struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+
-/// RemoveCommonExpressionsFromUseBases - Look through all of the uses in Bases,
-/// removing any common subexpressions from it.  Anything truly common is
-/// removed, accumulated, and returned.  This looks for things like (a+b+c) and
+/// RemoveCommonExpressionsFromUseBases - Look through all of the Bases of all
+/// the Uses, removing any common subexpressions, except that if all such
+/// subexpressions can be folded into an addressing mode for all uses inside
+/// the loop (this case is referred to as "free" in comments herein) we do
+/// not remove anything.  This looks for things like (a+b+c) and
 /// (a+c+d) and computes the common (a+c) subexpression.  The common expression
 /// is *removed* from the Bases and returned.
 static SCEVHandle
 RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
-                                    ScalarEvolution *SE, Loop *L) {
+                                    ScalarEvolution *SE, Loop *L,
+                                    const TargetLowering *TLI) {
   unsigned NumUses = Uses.size();
 
   // Only one use?  This is a very common case, so we handle it specially and
   // cheaply.
   SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
   SCEVHandle Result = Zero;
+  SCEVHandle FreeResult = Zero;
   if (NumUses == 1) {
     // If the use is inside the loop, use its base, regardless of what it is:
     // it is clearly shared across all the IV's.  If the use is outside the loop
@@ -939,7 +974,10 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
 
   // To find common subexpressions, count how many of Uses use each expression.
   // If any subexpressions are used Uses.size() times, they are common.
-  std::map<SCEVHandle, unsigned> SubExpressionUseCounts;
+  // Also track whether all uses of each expression can be moved into an
+  // addressing mode "for free"; such expressions are left within the loop.
+  // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+  std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
 
   // UniqueSubExprs - Keep track of all of the subexpressions we see in the
   // order we see them.
@@ -962,31 +1000,89 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
     // CSEs we can find.
     if (Uses[i].Base == Zero) return Zero;
 
+    // If this use is an address we may be able to put CSEs in the addressing
+    // mode rather than hoisting them.
+    bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace);
+    // We may need the UseTy below, but only when isAddrUse, so compute it
+    // only in that case.
+    const Type *UseTy = 0;
+    if (isAddrUse) {
+      UseTy = Uses[i].Inst->getType();
+      if (StoreInst *SI = dyn_cast<StoreInst>(Uses[i].Inst))
+        UseTy = SI->getOperand(0)->getType();
+    }
+
     // Split the expression into subexprs.
     SeparateSubExprs(SubExprs, Uses[i].Base, SE);
 
-    // Add one to SubExpressionUseCounts for each subexpr present.
-    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
-      if (++SubExpressionUseCounts[SubExprs[j]] == 1)
+    // Add one to SubExpressionUseData.Count for each subexpr present, and
+    // if the subexpr is not a valid immediate within an addressing mode use,
+    // set SubExpressionUseData.notAllUsesAreFree.  We definitely want to
+    // hoist these out of the loop (if they are common to all uses).
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+      if (++SubExpressionUseData[SubExprs[j]].Count == 1)
         UniqueSubExprs.push_back(SubExprs[j]);
+      if (!isAddrUse || !fitsInAddressMode(SubExprs[j], UseTy, TLI, false))
+        SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true;
+    }
 
     SubExprs.clear();
   }
 
   // Now that we know how many times each is used, build Result.  Iterate over
   // UniqueSubexprs so that we have a stable ordering.
   for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
-    std::map<SCEVHandle, unsigned>::iterator I =
-      SubExpressionUseCounts.find(UniqueSubExprs[i]);
-    assert(I != SubExpressionUseCounts.end() && "Entry not found?");
-    if (I->second == NumUsesInsideLoop) // Found CSE!
-      Result = SE->getAddExpr(Result, I->first);
-    else
-      // Remove non-cse's from SubExpressionUseCounts.
-      SubExpressionUseCounts.erase(I);
+    std::map<SCEVHandle, SubExprUseData>::iterator I =
+      SubExpressionUseData.find(UniqueSubExprs[i]);
+    assert(I != SubExpressionUseData.end() && "Entry not found?");
+    if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
+      if (I->second.notAllUsesAreFree)
+        Result = SE->getAddExpr(Result, I->first);
+      else
+        FreeResult = SE->getAddExpr(FreeResult, I->first);
+    } else
+      // Remove non-cse's from SubExpressionUseData.
+      SubExpressionUseData.erase(I);
   }
-
+
+  if (FreeResult != Zero) {
+    // We have some subexpressions that can be subsumed into addressing
+    // modes in every use inside the loop.  However, it's possible that
+    // there are so many of them that the combined FreeResult cannot
+    // be subsumed, or that the target cannot handle both a FreeResult
+    // and a Result in the same instruction (for example because it would
+    // require too many registers).  Check this.
+    for (unsigned i=0; i<NumUses; ++i) {
+      if (!L->contains(Uses[i].Inst->getParent()))
+        continue;
+      // We know this is an addressing mode use; if there are any uses that
+      // are not, FreeResult would be Zero.
+      const Type *UseTy = Uses[i].Inst->getType();
+      if (StoreInst *SI = dyn_cast<StoreInst>(Uses[i].Inst))
+        UseTy = SI->getOperand(0)->getType();
+      if (!fitsInAddressMode(FreeResult, UseTy, TLI, Result!=Zero)) {
+        // FIXME: could split up FreeResult into pieces here, some hoisted
+        // and some not.  Doesn't seem worth it for now.
+        Result = SE->getAddExpr(Result, FreeResult);
+        FreeResult = Zero;
+        break;
+      }
+    }
+  }
+
   // If we found no CSE's, return now.
   if (Result == Zero) return Result;
 
+  // If we still have a FreeResult, remove its subexpressions from
+  // SubExpressionUseData.  This means they will remain in the use Bases.
+  if (FreeResult != Zero) {
+    SeparateSubExprs(SubExprs, FreeResult, SE);
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+      std::map<SCEVHandle, SubExprUseData>::iterator I =
+        SubExpressionUseData.find(SubExprs[j]);
+      SubExpressionUseData.erase(I);
+    }
+    SubExprs.clear();
+  }
+
   // Otherwise, remove all of the CSE's we found from each of the base values.
   for (unsigned i = 0; i != NumUses; ++i) {
     // Uses outside the loop don't necessarily include the common base, but
@@ -1003,7 +1099,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
 
     // Remove any common subexpressions.
     for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
-      if (SubExpressionUseCounts.count(SubExprs[j])) {
+      if (SubExpressionUseData.count(SubExprs[j])) {
         SubExprs.erase(SubExprs.begin()+j);
         --j; --e;
       }
@@ -1131,34 +1227,6 @@ static bool isNonConstantNegative(const SCEVHandle &Expr) {
   return SC->getValue()->getValue().isNegative();
 }
 
-/// isAddressUse - Returns true if the specified instruction is using the
-/// specified value as an address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
-  bool isAddress = isa<LoadInst>(Inst);
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-    if (SI->getOperand(1) == OperandVal)
-      isAddress = true;
-  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-    // Addressing modes can also be folded into prefetches and a variety
-    // of intrinsics.
-    switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::prefetch:
-      case Intrinsic::x86_sse2_loadu_dq:
-      case Intrinsic::x86_sse2_loadu_pd:
-      case Intrinsic::x86_sse_loadu_ps:
-      case Intrinsic::x86_sse_storeu_ps:
-      case Intrinsic::x86_sse2_storeu_pd:
-      case Intrinsic::x86_sse2_storeu_dq:
-      case Intrinsic::x86_sse2_storel_dq:
-        if (II->getOperand(1) == OperandVal)
-          isAddress = true;
-        break;
-    }
-  }
-  return isAddress;
-}
-
 // CollectIVUsers - Transform our list of users and offsets to a bit more
 // complex table.  In this new vector, each 'BasedUser' contains 'Base', the
 // base of the strided accesses, as well as the old information from Uses.  We
@@ -1190,7 +1258,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
   // "A+B"), emit it to the preheader, then remove the expression from the
   // UsersToProcess base values.
   SCEVHandle CommonExprs =
-    RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L);
+    RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
 
   // Next, figure out what we can represent in the immediate fields of
   // instructions.  If we can represent anything there, move it to the imm
@@ -1347,7 +1415,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
   Constant *C = dyn_cast<Constant>(CommonBaseV);
   if (!C ||
       (!C->isNullValue() &&
-       !isTargetConstant(SE->getUnknown(CommonBaseV), ReplacedTy, TLI)))
+       !fitsInAddressMode(SE->getUnknown(CommonBaseV), ReplacedTy,
+                          TLI, false)))
     // We want the common base emitted into the preheader!  This is just
     // using cast as a copy so BitCast (no-op cast) is appropriate
     CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
@@ -1403,7 +1472,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
   // this by forcing a BitCast (noop cast) to be inserted into the preheader
   // in this case.
   if (Constant *C = dyn_cast<Constant>(BaseV)) {
-    if (!C->isNullValue() && !isTargetConstant(Base, ReplacedTy, TLI)) {
+    if (!C->isNullValue() && !fitsInAddressMode(Base, ReplacedTy,
+                                                TLI, false)) {
      // We want this constant emitted into the preheader!  This is just
      // using cast as a copy so BitCast (no-op cast) is appropriate
      BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
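The three CodeGen tests that follow all compile essentially the same loop. Reconstructed from the IR (an illustrative assumption, not part of the patch), the source is roughly:

    // Hypothetical C source behind the tests below, rebuilt from the IR.
    int A[16][16];

    void test(int row, int N) {
      for (int i = 0; i < N; ++i) {
        A[row][i + 1] = 4;  // both stores share the loop-invariant base
        A[row][i + 2] = 5;  // &A[row][0]; only i+1 / i+2 vary per iteration
      }
    }

Whether &A[row][0] stays inside the loop, folded into each store's addressing mode, or is hoisted into a register in the preheader is decided by the relocation model, and each test pins down one of the cases.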
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
new file mode 100644
index 00000000000..1375793bce3
--- /dev/null
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=pic | \
+; RUN:   grep {A-} | count 1
+;
+; Make sure the common loop invariant A is hoisted up to the preheader,
+; since too many registers are needed to subsume it into the addressing modes.
+
+@A = global [16 x [16 x i32]] zeroinitializer, align 32  ; <[16 x [16 x i32]]*> [#uses=2]
+
+define void @test(i32 %row, i32 %N.in) nounwind {
+entry:
+  %N = bitcast i32 %N.in to i32  ; <i32> [#uses=1]
+  %tmp5 = icmp sgt i32 %N.in, 0  ; <i1> [#uses=1]
+  br i1 %tmp5, label %cond_true, label %return
+
+cond_true:  ; preds = %cond_true, %entry
+  %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %cond_true ]  ; <i32> [#uses=2]
+  %i.0.0 = bitcast i32 %indvar to i32  ; <i32> [#uses=2]
+  %tmp2 = add i32 %i.0.0, 1  ; <i32> [#uses=1]
+  %tmp = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp2  ; <i32*> [#uses=1]
+  store i32 4, i32* %tmp
+  %tmp5.upgrd.1 = add i32 %i.0.0, 2  ; <i32> [#uses=1]
+  %tmp7 = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp5.upgrd.1  ; <i32*> [#uses=1]
+  store i32 5, i32* %tmp7
+  %indvar.next = add i32 %indvar, 1  ; <i32> [#uses=2]
+  %exitcond = icmp eq i32 %indvar.next, %N  ; <i1> [#uses=1]
+  br i1 %exitcond, label %return, label %cond_true
+
+return:  ; preds = %cond_true, %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/loop-strength-reduce-3.ll b/test/CodeGen/X86/loop-strength-reduce-3.ll
new file mode 100644
index 00000000000..b6bb81471bc
--- /dev/null
+++ b/test/CodeGen/X86/loop-strength-reduce-3.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | \
+; RUN:   grep {A+} | count 2
+;
+; Make sure the common loop invariant A is not hoisted up to the preheader,
+; since it can be subsumed into the addressing modes.
+
+@A = global [16 x [16 x i32]] zeroinitializer, align 32  ; <[16 x [16 x i32]]*> [#uses=2]
+
+define void @test(i32 %row, i32 %N.in) nounwind {
+entry:
+  %N = bitcast i32 %N.in to i32  ; <i32> [#uses=1]
+  %tmp5 = icmp sgt i32 %N.in, 0  ; <i1> [#uses=1]
+  br i1 %tmp5, label %cond_true, label %return
+
+cond_true:  ; preds = %cond_true, %entry
+  %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %cond_true ]  ; <i32> [#uses=2]
+  %i.0.0 = bitcast i32 %indvar to i32  ; <i32> [#uses=2]
+  %tmp2 = add i32 %i.0.0, 1  ; <i32> [#uses=1]
+  %tmp = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp2  ; <i32*> [#uses=1]
+  store i32 4, i32* %tmp
+  %tmp5.upgrd.1 = add i32 %i.0.0, 2  ; <i32> [#uses=1]
+  %tmp7 = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp5.upgrd.1  ; <i32*> [#uses=1]
+  store i32 5, i32* %tmp7
+  %indvar.next = add i32 %indvar, 1  ; <i32> [#uses=2]
+  %exitcond = icmp eq i32 %indvar.next, %N  ; <i1> [#uses=1]
+  br i1 %exitcond, label %return, label %cond_true
+
+return:  ; preds = %cond_true, %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/loop-strength-reduce.ll b/test/CodeGen/X86/loop-strength-reduce.ll
index 8bacbd97bbe..873710112b6 100644
--- a/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,7 +1,8 @@
-; RUN: llvm-as < %s | llc -march=x86 | \
-; RUN:   grep {A(} | count 1
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | \
+; RUN:   grep {A+} | count 2
 ;
-; Make sure the common loop invariant _A(reg) is hoisted up to preheader.
+; Make sure the common loop invariant A is not hoisted up to the preheader,
+; since it can be subsumed into the addressing mode in all uses.
 
 @A = internal global [16 x [16 x i32]] zeroinitializer, align 32  ; <[16 x [16 x i32]]*> [#uses=2]
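For reference, the decision the rewritten RemoveCommonExpressionsFromUseBases makes can be modeled in isolation: a subexpression common to all uses is hoisted only if some use cannot absorb it "for free". A runnable toy (strings stand in for SCEV handles; illustrative only, not the pass's actual data flow):

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    struct SubExprUseData { unsigned Count = 0; bool notAllUsesAreFree = false; };

    int main() {
      // Two address uses; each lists its subexpressions and whether that use
      // could fold the subexpression into its addressing mode for free.
      std::vector<std::vector<std::pair<std::string, bool> > > Uses = {
        { {"A", true}, {"i+1", false} },   // store to A[row][i+1]
        { {"A", true}, {"i+2", false} },   // store to A[row][i+2]
      };

      std::map<std::string, SubExprUseData> Data;
      for (const auto &Use : Uses)
        for (const auto &P : Use) {
          ++Data[P.first].Count;
          if (!P.second) Data[P.first].notAllUsesAreFree = true;
        }

      // "A" is common to every use and free in every use, so it goes to
      // FreeResult and stays in the loop; nothing lands in Result (hoisted).
      assert(Data["A"].Count == Uses.size() && !Data["A"].notAllUsesAreFree);
      assert(Data["i+1"].Count != Uses.size());  // not common: never hoisted
      return 0;
    }

The remaining wrinkle, handled by the FreeResult loop in the patch, is that even an everywhere-free subexpression is forced back into Result when the combined expression no longer fits some use's addressing mode, for example because the target cannot spare the extra register.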