[MemCpyOpt] Turn memcpy from just-memset'd source into memset.

There's no point in copying around constants, so, when all else fails,
we can still transform memcpy of memset into two independent memsets.

To quote the example, we can turn:
  memset(dst1, c, dst1_size);
  memcpy(dst2, dst1, dst2_size);
into:
  memset(dst1, c, dst1_size);
  memset(dst2, c, dst2_size);
When dst2_size <= dst1_size.

Like r235232 for copy constructors, this can occur in move constructors.

Differential Revision: http://reviews.llvm.org/D9682


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237506 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Ahmed Bougacha 2015-05-16 01:32:26 +00:00
parent 487db4685a
commit 8081057fca
4 changed files with 149 additions and 3 deletions

View File

@ -346,6 +346,7 @@ namespace {
uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
bool processByValArgument(CallSite CS, unsigned ArgNo);
Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
Value *ByteVal);
@ -896,6 +897,39 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
return true;
}
/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
/// memset(dst1, c, dst1_size);
/// memcpy(dst2, dst1, dst2_size);
/// \endcode
/// into:
/// \code
/// memset(dst1, c, dst1_size);
/// memset(dst2, c, dst2_size);
/// \endcode
/// When dst2_size <= dst1_size.
///
/// The \p MemCpy must have a Constant length.
bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
MemSetInst *MemSet) {
// This only makes sense on memcpy(..., memset(...), ...).
if (MemSet->getRawDest() != MemCpy->getRawSource())
return false;
ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
// Make sure the memcpy doesn't read any more than what the memset wrote.
// Don't worry about sizes larger than i64.
if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
return false;
IRBuilder<> Builder(MemCpy->getNextNode());
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
CopySize, MemCpy->getAlignment());
return true;
}
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
@ -938,12 +972,13 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
// The are three possible optimizations we can do for memcpy:
// There are four possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundance for DSE.
// b) call-memcpy xform for return slot optimization.
// c) memcpy from freshly alloca'd space or space that has just started its
// lifetime copies undefined data, and we can therefore eliminate the
// memcpy in favor of the data that was already at the destination.
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
@ -984,6 +1019,15 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
}
}
if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
return true;
}
return false;
}

View File

@ -17,10 +17,11 @@ define void @must_remove_memcpy(i8* noalias nocapture dereferenceable(4096) %dst
}
; memset touch more bytes than those guaranteed to be dereferenceable
; We can't remove the memcpy, but we can turn it into an independent memset.
define void @must_not_remove_memcpy(i8* noalias nocapture dereferenceable(1024) %dst) {
; CHECK-LABEL: @must_not_remove_memcpy(
; CHECK: call void @llvm.memset.p0i8.i64
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
; CHECK: call void @llvm.memset.p0i8.i64
%src = alloca [4096 x i8], align 1
%p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0
call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i32 1, i1 false)

View File

@ -40,7 +40,7 @@ entry-block:
; CHECK: %[[a_cast:[^=]+]] = bitcast [8 x i64]* %[[a]] to i8*
; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 0, i64 64
; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[sret_cast]], i8* %[[a_cast]], i64 64
; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[sret_cast]], i8 0, i64 64
; CHECK: call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 42, i64 32
; CHECK: %[[out_cast:[^=]+]] = bitcast [8 x i64]* %out to i8*
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[out_cast]], i8* %[[a_cast]], i64 64

View File

@ -0,0 +1,101 @@
; RUN: opt -memcpyopt -S %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; CHECK-LABEL: define void @test(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i32 8, i1 false)
; CHECK-NEXT: ret void
define void @test(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 8, i1 false)
ret void
}
; CHECK-LABEL: define void @test_smaller_memcpy(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_smaller_memset(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i32 1, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_align_memset(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_different_types(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst2, i8 %c, i32 100, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 8, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst2, i8* %dst1, i32 100, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_different_types_2(
; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst1, i8 %c, i32 128, i32 8, i1 false)
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i32(i8* %dst1, i8 %c, i32 128, i32 8, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_different_source_gep(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
; CHECK-NEXT: %p = getelementptr i8, i8* %dst1, i64 64
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
; FIXME: We could optimize this as well.
%p = getelementptr i8, i8* %dst1, i64 64
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_variable_size_1(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i32 1, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i32 1, i1 false)
ret void
}
; CHECK-LABEL: define void @test_variable_size_2(
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i32 1, i1 false)
; CHECK-NEXT: ret void
define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) {
call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i32 1, i1 false)
ret void
}
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)