diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h index 571a04924d1..8a998a8cd0d 100644 --- a/include/llvm/Support/CallSite.h +++ b/include/llvm/Support/CallSite.h @@ -52,11 +52,6 @@ public: CallSiteBase(CallTy *CI) : I(CI, true) { assert(CI); } CallSiteBase(InvokeTy *II) : I(II, false) { assert(II); } CallSiteBase(ValTy *II) { *this = get(II); } - CallSiteBase(InstrTy *II) { - assert(II && "Null instruction given?"); - *this = get(II); - assert(I.getPointer() && "Not a call?"); - } protected: /// CallSiteBase::get - This static method is sort of like a constructor. It /// will create an appropriate call site for a Call or Invoke instruction, but diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 6f53c322ef5..4f1cfd622f0 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -301,11 +301,13 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { namespace { class MemCpyOpt : public FunctionPass { + MemoryDependenceAnalysis *MD; bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); + MD = 0; } private: @@ -327,6 +329,7 @@ namespace { uint64_t cpyLen, CallInst *C); bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize); + bool processByValArgument(CallSite CS, unsigned ArgNo); bool iterateOnFunction(Function &F); }; @@ -359,9 +362,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // a memcpy. 
if (LoadInst *LI = dyn_cast(SI->getOperand(0))) { if (!LI->isVolatile() && LI->hasOneUse()) { - MemoryDependenceAnalysis &MD = getAnalysis(); - - MemDepResult dep = MD.getDependency(LI); + MemDepResult dep = MD->getDependency(LI); CallInst *C = 0; if (dep.isClobber() && !isa(dep.getInst())) C = dyn_cast(dep.getInst()); @@ -372,7 +373,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { - MD.removeInstruction(SI); + MD->removeInstruction(SI); SI->eraseFromParent(); LI->eraseFromParent(); ++NumMemCpyInstr; @@ -505,8 +506,8 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt); DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i]; - dbgs() << "With: " << *C); C=C; + dbgs() << *Range.TheStores[i] << '\n'; + dbgs() << "With: " << *C << '\n'); C=C; // Don't invalidate the iterator BBI = BI; @@ -657,11 +658,10 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Drop any cached information about the call, because we may have changed // its dependence information by changing its parameter. - MemoryDependenceAnalysis &MD = getAnalysis(); - MD.removeInstruction(C); + MD->removeInstruction(C); - // Remove the memcpy - MD.removeInstruction(cpy); + // Remove the memcpy. + MD->removeInstruction(cpy); ++NumMemCpyInstr; return true; @@ -675,7 +675,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transforms memcpy's where the dest of one is the source of the // other. 
- if (M->getSource() != MDep->getDest()) + if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; // Second, the length of the memcpy's must be the same, or the preceeding one @@ -684,27 +684,26 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, if (!C1) return false; uint64_t DepSize = C1->getValue().getZExtValue(); - if (DepSize < MSize) return false; - Intrinsic::ID ResultFn = Intrinsic::memcpy; + AliasAnalysis &AA = getAnalysis(); // If the dest of the second might alias the source of the first, then the // source and dest might overlap. We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. - AliasAnalysis &AA = getAnalysis(); + Intrinsic::ID ResultFn = Intrinsic::memcpy; if (!AA.isNoAlias(M->getRawDest(), MSize, MDep->getRawSource(), DepSize)) ResultFn = Intrinsic::memmove; - // If all checks passed, then we can transform these memcpy's + // If all checks passed, then we can transform M. const Type *ArgTys[3] = { M->getRawDest()->getType(), MDep->getRawSource()->getType(), M->getLength()->getType() }; Function *MemCpyFun = - Intrinsic::getDeclaration(M->getParent()->getParent()->getParent(), + Intrinsic::getDeclaration(MDep->getParent()->getParent()->getParent(), ResultFn, ArgTys, 3); // Make sure to use the lesser of the alignment of the source and the dest @@ -717,29 +716,30 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, M->getRawDest(), MDep->getRawSource(), M->getLength(), - ConstantInt::get(Type::getInt32Ty(M->getContext()), Align), + ConstantInt::get(Type::getInt32Ty(MemCpyFun->getContext()), Align), M->getVolatileCst() }; CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M); - MemoryDependenceAnalysis &MD = getAnalysis(); - // Verify that the copied-from memory doesn't change in between the two // transfers. 
For example, in: // memcpy(a <- b) // *b = 42; // memcpy(c <- a) // It would be invalid to transform the second memcpy into memcpy(c <- b). - MemDepResult NewDep = MD.getDependency(C); + // + // TODO: If the code between M and MDep is transparent to the destination "c", + // then we could still perform the xform by moving M up to the first memcpy. + MemDepResult NewDep = MD->getDependency(C); if (!NewDep.isClobber() || NewDep.getInst() != MDep) { - MD.removeInstruction(C); + MD->removeInstruction(C); C->eraseFromParent(); return false; } // Otherwise we're good! Nuke the instruction we're replacing. - MD.removeInstruction(M); + MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; @@ -752,25 +752,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { - MemoryDependenceAnalysis &MD = getAnalysis(); - - // We can only optimize statically-sized memcpy's. - ConstantInt *cpyLen = dyn_cast(M->getLength()); - if (!cpyLen) return false; + // We can only optimize statically-sized memcpy's that are non-volatile. + ConstantInt *CopySize = dyn_cast(M->getLength()); + if (CopySize == 0 || M->isVolatile()) return false; // The are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. 
- MemDepResult dep = MD.getDependency(M); - if (!dep.isClobber()) + MemDepResult DepInfo = MD->getDependency(M); + if (!DepInfo.isClobber()) return false; - if (MemCpyInst *MDep = dyn_cast(dep.getInst())) - return processMemCpyMemCpyDependence(M, MDep, cpyLen->getZExtValue()); + if (MemCpyInst *MDep = dyn_cast(DepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); - if (CallInst *C = dyn_cast(dep.getInst())) { + if (CallInst *C = dyn_cast(DepInfo.getInst())) { bool changed = performCallSlotOptzn(M, M->getDest(), M->getSource(), - cpyLen->getZExtValue(), C); + CopySize->getZExtValue(), C); if (changed) M->eraseFromParent(); return changed; } @@ -805,33 +803,116 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. - getAnalysis().removeInstruction(M); + MD->removeInstruction(M); ++NumMoveToCpy; return true; } +/// processByValArgument - This is called on every byval argument in call sites. +bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { + TargetData *TD = getAnalysisIfAvailable(); + if (!TD) return false; -// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN. + Value *ByValArg = CS.getArgument(ArgNo); + + // MemDep doesn't have a way to do a local query with a memory location. + // Instead, just insert a load and ask for its dependences. + LoadInst *TmpLoad = new LoadInst(ByValArg, "", CS.getInstruction()); + MemDepResult DepInfo = MD->getDependency(TmpLoad); + + MD->removeInstruction(TmpLoad); + TmpLoad->eraseFromParent(); + + if (!DepInfo.isClobber()) + return false; + + // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by + // a memcpy, see if we can byval from the source of the memcpy instead of the + // result. 
+ MemCpyInst *MDep = dyn_cast(DepInfo.getInst()); + if (MDep == 0 || MDep->isVolatile() || + ByValArg->stripPointerCasts() != MDep->getDest()) + return false; + + // The length of the memcpy must be larger or equal to the size of the byval. + // Otherwise part of the byval argument would be left uninitialized by it. + ConstantInt *C1 = dyn_cast(MDep->getLength()); + if (C1 == 0 || + C1->getValue().getZExtValue() < TD->getTypeAllocSize(ByValArg->getType())) + return false; + + // Get the alignment of the byval. If it is greater than the memcpy, then we + // can't do the substitution. If the call doesn't specify the alignment, then + // it is some target specific value that we can't know. + unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); + if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign) + return false; + + // Verify that the copied-from memory doesn't change in between the memcpy and + // the byval call. + // memcpy(a <- b) + // *b = 42; + // foo(*a) + // It would be invalid to transform the call into foo(*b). + Value *TmpCast = MDep->getSource(); + if (MDep->getSource()->getType() != ByValArg->getType()) + TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), + "tmpcast", CS.getInstruction()); + Value *TmpVal = + UndefValue::get(cast(TmpCast->getType())->getElementType()); + Instruction *TmpStore = new StoreInst(TmpVal, TmpCast, false, + CS.getInstruction()); + DepInfo = MD->getDependency(TmpStore); + bool isUnsafe = !DepInfo.isClobber() || DepInfo.getInst() != MDep; + MD->removeInstruction(TmpStore); + TmpStore->eraseFromParent(); + + if (isUnsafe) { + // Clean up the inserted cast instruction. + if (TmpCast != MDep->getSource()) + cast(TmpCast)->eraseFromParent(); + return false; + } + + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" + << " " << *MDep << "\n" + << " " << *CS.getInstruction() << "\n"); + + // Otherwise we're good! Update the byval argument. 
+ CS.setArgument(ArgNo, TmpCast); + ++NumMemCpyInstr; + return true; +} + +/// iterateOnFunction - Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; // Walk all instruction in the function. for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE;) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = BI++; + bool RepeatInstruction = false; + if (StoreInst *SI = dyn_cast(I)) MadeChange |= processStore(SI, BI); - else if (MemCpyInst *M = dyn_cast(I)) - MadeChange |= processMemCpy(M); - else if (MemMoveInst *M = dyn_cast(I)) { - if (processMemMove(M)) { - --BI; // Reprocess the new memcpy. - MadeChange = true; - } + else if (MemCpyInst *M = dyn_cast(I)) { + RepeatInstruction = processMemCpy(M); + } else if (MemMoveInst *M = dyn_cast(I)) { + RepeatInstruction = processMemMove(M); + } else if (CallSite CS = (Value*)I) { + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.paramHasAttr(i+1, Attribute::ByVal)) + MadeChange |= processByValArgument(CS, i); + } + + // Reprocess the instruction if desired. 
+ if (RepeatInstruction) { + --BI; + MadeChange = true; } } } @@ -844,12 +925,14 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { // bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; + MD = &getAnalysis(); while (1) { if (!iterateOnFunction(F)) break; MadeChange = true; } + MD = 0; return MadeChange; } diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll index 303c2fcceef..7309319c46e 100644 --- a/test/Transforms/MemCpyOpt/memcpy.ll +++ b/test/Transforms/MemCpyOpt/memcpy.ll @@ -62,3 +62,18 @@ define void @test3({ x86_fp80, x86_fp80 }* noalias sret %agg.result) nounwind { ; CHECK-NEXT: call void @llvm.memcpy ; CHECK-NEXT: ret void } + + +; PR8644 +define void @test4(i8 *%P) { + %A = alloca {i32, i32} + %a = bitcast {i32, i32}* %A to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %P, i64 8, i32 4, i1 false) + call void @test4a(i8* byval align 1 %a) + ret void +; CHECK: @test4 +; CHECK-NEXT: call void @test4a( +} + +declare void @test4a(i8* byval align 1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/Transforms/MemCpyOpt/sret.ll b/test/Transforms/MemCpyOpt/sret.ll index d35ab910d7c..ddfd0fd1fcc 100644 --- a/test/Transforms/MemCpyOpt/sret.ll +++ b/test/Transforms/MemCpyOpt/sret.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin9" -define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind { +define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 8 %z) nounwind { entry: %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3] %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2] @@ -16,7 +16,7 @@ entry: %tmp8 = load x86_fp80* %tmp7, align 16 ; 
[#uses=1] store x86_fp80 %tmp3, x86_fp80* %real, align 16 store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16 - call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind + call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval align 8 %iz ) nounwind %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; [#uses=1] %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; [#uses=1] call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )