From a188894d67a3cc2516b25aae9b3cbdbff4b0babe Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Mon, 12 Dec 2005 07:19:13 +0000 Subject: [PATCH] Implement a little hack for parity with GCC on crafty. This speeds up 186.crafty by about 16% (from 15.109s to 13.045s) on my system. This turns allocas with unions/casts into scalars. For example crafty has something like this: union doub { unsigned short i[4]; long long d; }; int f(long long a) { return ((union doub){.d=a}).i[1]; } Instead of generating loads and stores to an alloca, we now promote the whole thing to a scalar long value. This implements: Transforms/ScalarRepl/AggregatePromote.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@24667 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Scalar/ScalarReplAggregates.cpp | 279 +++++++++++++++++- 1 file changed, 277 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index cc03d0ee683..4c5e3a30b6a 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -26,9 +26,10 @@ #include "llvm/Pass.h" #include "llvm/Instructions.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -37,6 +38,8 @@ using namespace llvm; namespace { Statistic<> NumReplaced("scalarrepl", "Number of allocas broken up"); Statistic<> NumPromoted("scalarrepl", "Number of allocas promoted"); + Statistic<> NumConverted("scalarrepl", + "Number of aggregates converted to scalar"); struct SROA : public FunctionPass { bool runOnFunction(Function &F); @@ -59,6 +62,10 @@ namespace { int isSafeAllocaToScalarRepl(AllocationInst *AI); void CanonicalizeAllocaUsers(AllocationInst *AI); AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocationInst *Base); + + const Type *CanConvertToScalar(Value *V, bool &IsNotTrivial); + void ConvertToScalar(AllocationInst *AI, const Type *Ty); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset); }; RegisterOpt X("scalarrepl", "Scalar Replacement of Aggregates"); @@ -112,7 +119,6 @@ bool SROA::performPromotion(Function &F) { return Changed; } - // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the malloc/alloca instructions in the function, removing // them if they are only used by getelementptr instructions. @@ -131,6 +137,16 @@ bool SROA::performScalarRepl(Function &F) { while (!WorkList.empty()) { AllocationInst *AI = WorkList.back(); WorkList.pop_back(); + + // If we can turn this aggregate value (potentially with casts) into a + // simple scalar value that can be mem2reg'd into a register value. + bool IsNotTrivial = false; + if (const Type *ActualType = CanConvertToScalar(AI, IsNotTrivial)) + if (IsNotTrivial) { + ConvertToScalar(AI, ActualType); + Changed = true; + continue; + } // We cannot transform the allocation instruction if it is an array // allocation (allocations OF arrays are ok though), and an allocation of a @@ -378,3 +394,262 @@ void SROA::CanonicalizeAllocaUsers(AllocationInst *AI) { } } } + +/// MergeInType - Add the 'In' type to the accumulated type so far. If the +/// types are incompatible, return true, otherwise update Accum and return +/// false. +static bool MergeInType(const Type *In, const Type *&Accum) { + if (!In->isIntegral()) return true; + + // If this is our first type, just use it. + if (Accum == Type::VoidTy) { + Accum = In; + } else { + // Otherwise pick whichever type is larger. + if (In->getTypeID() > Accum->getTypeID()) + Accum = In; + } + return false; +} + +/// getUIntAtLeastAsBitAs - Return an unsigned integer type that is at least +/// as big as the specified type. If there is no suitable type, this returns +/// null. +const Type *getUIntAtLeastAsBitAs(unsigned NumBits) { + if (NumBits > 64) return 0; + if (NumBits > 32) return Type::ULongTy; + if (NumBits > 16) return Type::UIntTy; + if (NumBits > 8) return Type::UShortTy; + return Type::UByteTy; +} + +/// CanConvertToScalar - V is a pointer. If we can convert the pointee to a +/// single scalar integer type, return that type. Further, if the use is not +/// a completely trivial use that mem2reg could promote, set IsNotTrivial. If +/// there are no uses of this pointer, return Type::VoidTy to differentiate from +/// failure. +/// +const Type *SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial) { + const Type *UsedType = Type::VoidTy; // No uses, no forced type. + const TargetData &TD = getAnalysis(); + const PointerType *PTy = cast(V->getType()); + + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { + Instruction *User = cast(*UI); + + if (LoadInst *LI = dyn_cast(User)) { + if (MergeInType(LI->getType(), UsedType)) + return 0; + + } else if (StoreInst *SI = dyn_cast(User)) { + // Storing the pointer, not the into the value? + if (SI->getOperand(0) == V) return 0; + + // NOTE: We could handle storing of FP imms here! + + if (MergeInType(SI->getOperand(0)->getType(), UsedType)) + return 0; + } else if (CastInst *CI = dyn_cast(User)) { + if (!isa(CI->getType())) return 0; + IsNotTrivial = true; + const Type *SubTy = CanConvertToScalar(CI, IsNotTrivial); + if (!SubTy || MergeInType(SubTy, UsedType)) return 0; + } else if (GetElementPtrInst *GEP = dyn_cast(User)) { + // Check to see if this is stepping over an element: GEP Ptr, int C + if (GEP->getNumOperands() == 2 && isa(GEP->getOperand(1))) { + unsigned Idx = cast(GEP->getOperand(1))->getRawValue(); + unsigned ElSize = TD.getTypeSize(PTy->getElementType()); + unsigned BitOffset = Idx*ElSize*8; + if (BitOffset > 64 || !isPowerOf2_32(ElSize)) return 0; + + IsNotTrivial = true; + const Type *SubElt = CanConvertToScalar(GEP, IsNotTrivial); + if (SubElt == 0) return 0; + if (SubElt != Type::VoidTy) { + const Type *NewTy = + getUIntAtLeastAsBitAs(SubElt->getPrimitiveSizeInBits()+BitOffset); + if (NewTy == 0 || MergeInType(NewTy, UsedType)) return 0; + continue; + } + } else if (GEP->getNumOperands() == 3 && + isa(GEP->getOperand(1)) && + isa(GEP->getOperand(2)) && + cast(GEP->getOperand(1))->isNullValue()) { + // We are stepping into an element, e.g. a structure or an array: + // GEP Ptr, int 0, uint C + const Type *AggTy = PTy->getElementType(); + unsigned Idx = cast(GEP->getOperand(2))->getRawValue(); + + if (const ArrayType *ATy = dyn_cast(AggTy)) { + if (Idx >= ATy->getNumElements()) return 0; // Out of range. + } else if (const PackedType *PTy = dyn_cast(AggTy)) { + if (Idx >= PTy->getNumElements()) return 0; // Out of range. + } else if (isa(AggTy)) { + // Structs are always ok. + } else { + return 0; + } + const Type *NTy = getUIntAtLeastAsBitAs(TD.getTypeSize(AggTy)*8); + if (NTy == 0 || MergeInType(NTy, UsedType)) return 0; + const Type *SubTy = CanConvertToScalar(GEP, IsNotTrivial); + if (SubTy == 0) return 0; + if (SubTy != Type::VoidTy && MergeInType(SubTy, UsedType)) + return 0; + continue; // Everything looks ok + } + return 0; + } else { + // Cannot handle this! + return 0; + } + } + + return UsedType; +} + +/// ConvertToScalar - The specified alloca passes the CanConvertToScalar +/// predicate and is non-trivial. Convert it to something that can be trivially +/// promoted into a register by mem2reg. +void SROA::ConvertToScalar(AllocationInst *AI, const Type *ActualTy) { + DEBUG(std::cerr << "CONVERT TO SCALAR: " << *AI << " TYPE = " + << *ActualTy << "\n"); + ++NumConverted; + + BasicBlock *EntryBlock = AI->getParent(); + assert(EntryBlock == &EntryBlock->getParent()->front() && + "Not in the entry block!"); + EntryBlock->getInstList().remove(AI); // Take the alloca out of the program. + + // Create and insert the alloca. + AllocaInst *NewAI = new AllocaInst(ActualTy->getUnsignedVersion(), 0, + AI->getName(), EntryBlock->begin()); + ConvertUsesToScalar(AI, NewAI, 0); + delete AI; +} + + +/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca +/// directly. Offset is an offset from the original alloca, in bits that need +/// to be shifted to the right. By the end of this, there should be no uses of +/// Ptr. +void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset) { + while (!Ptr->use_empty()) { + Instruction *User = cast(Ptr->use_back()); + + if (LoadInst *LI = dyn_cast(User)) { + // The load is a bit extract from NewAI shifted right by Offset bits. + Value *NV = new LoadInst(NewAI, LI->getName(), LI); + if (Offset) + NV = new ShiftInst(Instruction::Shr, NV, + ConstantUInt::get(Type::UByteTy, Offset), + LI->getName(), LI); + if (NV->getType() != LI->getType()) + NV = new CastInst(NV, LI->getType(), LI->getName(), LI); + LI->replaceAllUsesWith(NV); + LI->eraseFromParent(); + } else if (StoreInst *SI = dyn_cast(User)) { + assert(SI->getOperand(0) != Ptr && "Consistency error!"); + + // Convert the stored type to the actual type, shift it left to insert + // then 'or' into place. + Value *SV = SI->getOperand(0); + if (SV->getType() == NewAI->getType()->getElementType()) { + assert(Offset == 0 && "Store out of bounds!"); + } else { + Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI); + // If SV is signed, convert it to unsigned, so that the next cast zero + // extends the value. + if (SV->getType()->isSigned()) + SV = new CastInst(SV, SV->getType()->getUnsignedVersion(), + SV->getName(), SI); + SV = new CastInst(SV, Old->getType(), SV->getName(), SI); + if (Offset) + SV = new ShiftInst(Instruction::Shl, SV, + ConstantUInt::get(Type::UByteTy, Offset), + SV->getName()+".adj", SI); + // Mask out the bits we are about to insert from the old value. + unsigned TotalBits = SV->getType()->getPrimitiveSizeInBits(); + unsigned InsertBits = + SI->getOperand(0)->getType()->getPrimitiveSizeInBits(); + if (TotalBits != InsertBits) { + assert(TotalBits > InsertBits); + uint64_t Mask = ~(((1ULL << InsertBits)-1) << Offset); + if (TotalBits != 64) + Mask = Mask & ((1ULL << TotalBits)-1); + Old = BinaryOperator::createAnd(Old, + ConstantUInt::get(Old->getType(), Mask), + Old->getName()+".mask", SI); + SV = BinaryOperator::createOr(Old, SV, SV->getName()+".ins", SI); + } + } + new StoreInst(SV, NewAI, SI); + SI->eraseFromParent(); + + } else if (CastInst *CI = dyn_cast(User)) { + unsigned NewOff = Offset; + const TargetData &TD = getAnalysis(); + if (TD.isBigEndian()) { + // Adjust the pointer. For example, storing 16-bits into a 32-bit + // alloca with just a cast makes it modify the top 16-bits. + const Type *SrcTy = cast(Ptr->getType())->getElementType(); + const Type *DstTy = cast(CI->getType())->getElementType(); + int PtrDiffBits = TD.getTypeSize(SrcTy)*8-TD.getTypeSize(DstTy)*8; + NewOff += PtrDiffBits; + } + ConvertUsesToScalar(CI, NewAI, NewOff); + CI->eraseFromParent(); + } else if (GetElementPtrInst *GEP = dyn_cast(User)) { + const PointerType *AggPtrTy = + cast(GEP->getOperand(0)->getType()); + const TargetData &TD = getAnalysis(); + unsigned AggSizeInBits = TD.getTypeSize(AggPtrTy->getElementType())*8; + + // Check to see if this is stepping over an element: GEP Ptr, int C + unsigned NewOffset = Offset; + if (GEP->getNumOperands() == 2) { + unsigned Idx = cast(GEP->getOperand(1))->getRawValue(); + unsigned BitOffset = Idx*AggSizeInBits; + + if (TD.isLittleEndian()) + NewOffset += BitOffset; + else + NewOffset -= BitOffset; + + } else if (GEP->getNumOperands() == 3) { + // We know that operand #2 is zero. + unsigned Idx = cast(GEP->getOperand(2))->getRawValue(); + const Type *AggTy = AggPtrTy->getElementType(); + if (const SequentialType *SeqTy = dyn_cast(AggTy)) { + unsigned ElSizeBits = TD.getTypeSize(SeqTy->getElementType())*8; + + if (TD.isLittleEndian()) + NewOffset += ElSizeBits*Idx; + else + NewOffset += AggSizeInBits-ElSizeBits*(Idx+1); + } else if (const StructType *STy = dyn_cast(AggTy)) { + unsigned EltBitOffset = TD.getStructLayout(STy)->MemberOffsets[Idx]*8; + + if (TD.isLittleEndian()) + NewOffset += EltBitOffset; + else { + const PointerType *ElPtrTy = cast(GEP->getType()); + unsigned ElSizeBits = TD.getTypeSize(ElPtrTy->getElementType())*8; + NewOffset += AggSizeInBits-(EltBitOffset+ElSizeBits); + } + + } else { + assert(0 && "Unsupported operation!"); + abort(); + } + } else { + assert(0 && "Unsupported operation!"); + abort(); + } + ConvertUsesToScalar(GEP, NewAI, NewOffset); + GEP->eraseFromParent(); + } else { + assert(0 && "Unsupported operation!"); + abort(); + } + } +}