diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index d55a8af7227..e092145ebe6 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -104,7 +104,7 @@ public: /// isPow2DivCheap() - Return true if pow2 div is cheaper than a chain of /// srl/add/sra. bool isPow2DivCheap() const { return Pow2DivIsCheap; } - + /// getSetCCResultTy - Return the ValueType of the result of setcc operations. /// MVT::ValueType getSetCCResultTy() const { return SetCCResultTy; } @@ -994,6 +994,13 @@ public: /// TODO: Handle pre/postinc as well. virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const; + /// isTruncateFree - Return true if it's free to truncate a value of + /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in + /// register EAX to i16 by referencing its sub-register AX. + virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const { + return false; + } + //===--------------------------------------------------------------------===// // Div utility functions // diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index ccd15be4e6b..41b38d84c8a 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -339,20 +339,18 @@ void foo(int N) { for (i = 0; i < N; i++) { X = i; Y = i*4; } } -LBB1_1: #bb.preheader - xorl %ecx, %ecx - xorw %dx, %dx -LBB1_2: #bb - movl L_X$non_lazy_ptr, %esi - movw %dx, (%esi) - movw %dx, %si - shlw $2, %si - movl L_Y$non_lazy_ptr, %edi - movw %si, (%edi) - incl %ecx - incw %dx - cmpl %eax, %ecx - jne LBB1_2 #bb +LBB1_1: # entry.bb_crit_edge + xorl %ecx, %ecx + xorw %dx, %dx +LBB1_2: # bb + movl L_X$non_lazy_ptr, %esi + movw %cx, (%esi) + movl L_Y$non_lazy_ptr, %esi + movw %dx, (%esi) + addw $4, %dx + incl %ecx + cmpl %eax, %ecx + jne LBB1_2 # bb vs. @@ -367,11 +365,7 @@ L4: cmpl %edx, %edi jne L4 -There are 3 issues: - -1. Lack of post regalloc LICM. -2. 
LSR unable to reused IV for a different type (i16 vs. i32) even though - the cast would be free. +This is due to the lack of post regalloc LICM. //===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4d528abb6b4..172aa5338b9 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5122,6 +5122,13 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, } +bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { + if (!Ty1->isInteger() || !Ty2->isInteger()) + return false; + return Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits(); +} + + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7123adaad27..b68de5a6753 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -359,6 +359,11 @@ namespace llvm { /// by AM is legal for this target, for a load/store of the specified type. virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const; + /// isTruncateFree - Return true if it's free to truncate a value of + /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in + /// register EAX to i16 by referencing its sub-register AX. + virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const; + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index fbe3171f81f..d81ea2b7994 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -178,7 +178,7 @@ private: bool FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse, const SCEVHandle *&CondStride); bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - unsigned CheckForIVReuse(bool, const SCEVHandle&, + unsigned CheckForIVReuse(bool, bool, const SCEVHandle&, IVExpr&, const Type*, const std::vector& UsersToProcess); bool ValidStride(bool, int64_t, @@ -980,15 +980,17 @@ bool LoopStrengthReduce::ValidStride(bool HasBaseReg, /// RequiresTypeConversion - Returns true if converting Ty to NewTy is not /// a nop. -bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty, - const Type *NewTy) { - if (Ty == NewTy) +bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, + const Type *Ty2) { + if (Ty1 == Ty2) return false; - return (!Ty->canLosslesslyBitCastTo(NewTy) && - !(isa(NewTy) && - Ty->canLosslesslyBitCastTo(UIntPtrTy)) && - !(isa(Ty) && - NewTy->canLosslesslyBitCastTo(UIntPtrTy))); + if (TLI && TLI->isTruncateFree(Ty1, Ty2)) + return false; + return (!Ty1->canLosslesslyBitCastTo(Ty2) && + !(isa(Ty2) && + Ty1->canLosslesslyBitCastTo(UIntPtrTy)) && + !(isa(Ty1) && + Ty2->canLosslesslyBitCastTo(UIntPtrTy))); } /// CheckForIVReuse - Returns the multiple if the stride is the multiple @@ -997,20 +999,23 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty, /// this stride to be rewritten as prev iv * factor. It returns 0 if no /// reuse is possible. 
unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, + bool AllUsesAreAddresses, const SCEVHandle &Stride, IVExpr &IV, const Type *Ty, const std::vector& UsersToProcess) { if (SCEVConstant *SC = dyn_cast(Stride)) { int64_t SInt = SC->getValue()->getSExtValue(); - if (SInt == 1) return 0; - for (std::map::iterator SI= IVsByStride.begin(), SE = IVsByStride.end(); SI != SE; ++SI) { int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); - if (SInt != -SSInt && + if (SI->first != Stride && (unsigned(abs(SInt)) < SSInt || (SInt % SSInt) != 0)) continue; int64_t Scale = SInt / SSInt; + // When scale is 1, we don't need to worry about whether the + // multiplication can be folded into the addressing mode. + if (!AllUsesAreAddresses && Scale != 1) + continue; // Check that this stride is valid for all the types used for loads and // stores; if it can be used for some and not others, we might as well use // the original stride everywhere, since we have to create the IV for it @@ -1021,7 +1026,7 @@ unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, // FIXME: Only handle base == 0 for now. // Only reuse previous IV if it would not require a type conversion. 
if (isZero(II->Base) && - !RequiresTypeConversion(II->Base->getType(),Ty)) { + !RequiresTypeConversion(II->Base->getType(), Ty)) { IV = *II; return Scale; } @@ -1183,10 +1188,9 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, SE->getIntegerSCEV(0, Type::Int32Ty), 0, 0); unsigned RewriteFactor = 0; - if (AllUsesAreAddresses) - RewriteFactor = CheckForIVReuse(HaveCommonExprs, Stride, ReuseIV, - CommonExprs->getType(), - UsersToProcess); + RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses, + Stride, ReuseIV, CommonExprs->getType(), + UsersToProcess); if (RewriteFactor != 0) { DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << " :\n"; diff --git a/test/CodeGen/X86/loop-strength-reduce5.ll b/test/CodeGen/X86/loop-strength-reduce5.ll new file mode 100644 index 00000000000..6e037e2aca3 --- /dev/null +++ b/test/CodeGen/X86/loop-strength-reduce5.ll @@ -0,0 +1,23 @@ +; RUN: llvm-as < %s | llc -march=x86 | grep inc | count 1 + +@X = weak global i16 0 ; [#uses=1] +@Y = weak global i16 0 ; [#uses=1] + +define void @foo(i32 %N) { +entry: + %tmp1019 = icmp sgt i32 %N, 0 ; [#uses=1] + br i1 %tmp1019, label %bb, label %return + +bb: ; preds = %bb, %entry + %i.014.0 = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=2] + %tmp1 = trunc i32 %i.014.0 to i16 ; [#uses=2] + volatile store i16 %tmp1, i16* @X, align 2 + %tmp34 = shl i16 %tmp1, 2 ; [#uses=1] + volatile store i16 %tmp34, i16* @Y, align 2 + %indvar.next = add i32 %i.014.0, 1 ; [#uses=2] + %exitcond = icmp eq i32 %indvar.next, %N ; [#uses=1] + br i1 %exitcond, label %return, label %bb + +return: ; preds = %bb, %entry + ret void +}