diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 42d7a939705..3478b310803 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/BitVector.h"
@@ -736,6 +737,7 @@ namespace {
                             SmallVector<Instruction*, 4>& toErase);
     bool processNonLocalLoad(LoadInst* L,
                              SmallVector<Instruction*, 4>& toErase);
+    bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                             DenseMap<BasicBlock*, Value*> &Phis,
                             bool top_level = false);
@@ -1017,6 +1019,84 @@ bool GVN::processLoad(LoadInst* L,
   return deletedLoad;
 }
 
+/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then B is replaced
+/// by a copy from X to Z: a memcpy when Z provably does not alias X, and a
+/// memmove otherwise.  This allows later passes to remove the first memcpy
+/// altogether.
+///
+/// M is the candidate "memcpy B".  On success M is queued on toErase and true
+/// is returned; otherwise nothing is changed and false is returned.
+bool GVN::processMemCpy(MemCpyInst* M,
+                        SmallVector<Instruction*, 4>& toErase) {
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+  // First, we have to check that the dependency is another memcpy.
+  // NOTE(review): nothing below re-checks that X is unmodified between MDep
+  // and M; this relies on MD.getDependency(M) -- confirm that is sufficient.
+  Instruction* dep = MD.getDependency(M);
+  if (dep == MemoryDependenceAnalysis::None ||
+      dep == MemoryDependenceAnalysis::NonLocal ||
+      !isa<MemCpyInst>(dep))
+    return false;
+
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other.
+  MemCpyInst* MDep = cast<MemCpyInst>(dep);
+  if (M->getSource() != MDep->getDest())
+    return false;
+
+  // Second, both lengths must be constants, and the preceding memcpy must
+  // copy at least as many bytes as the following one.
+  ConstantInt* DepLength = dyn_cast<ConstantInt>(MDep->getLength());
+  ConstantInt* CpyLength = dyn_cast<ConstantInt>(M->getLength());
+  if (!DepLength || !CpyLength)
+    return false;
+
+  uint64_t DepSize = DepLength->getLimitedValue();
+  uint64_t CpySize = CpyLength->getLimitedValue();
+  if (DepSize < CpySize)
+    return false;
+
+  // The alignment operand is a guarantee about both pointers of a copy.  M
+  // only vouches for Y and Z, and MDep only for X and Y, so a copy from X to
+  // Z may claim no more than the smaller of the two.  Reusing M's alignment
+  // unchanged could overstate how well X is aligned.
+  ConstantInt* Align = cast<ConstantInt>(M->getAlignment());
+  ConstantInt* DepAlign = cast<ConstantInt>(MDep->getAlignment());
+  if (DepAlign->getLimitedValue() < Align->getLimitedValue())
+    Align = DepAlign;
+
+  // Finally, find out whether the dest of the second may alias the source of
+  // the first.  If it may, the new copy's operands might overlap, so it has
+  // to be a memmove; otherwise the same memcpy flavor as M is kept.
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  Intrinsic::ID NewID = M->getIntrinsicID();
+  if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
+      AliasAnalysis::NoAlias)
+    NewID = (NewID == Intrinsic::memcpy_i32) ? Intrinsic::memmove_i32 :
+                                               Intrinsic::memmove_i64;
+
+  Function* CopyFun = Intrinsic::getDeclaration(
+                               M->getParent()->getParent()->getParent(),
+                               NewID);
+
+  std::vector<Value*> args;
+  args.push_back(M->getRawDest());
+  args.push_back(MDep->getRawSource());
+  args.push_back(M->getLength());
+  args.push_back(Align);
+
+  // Insert the replacement right before M; M itself is queued for erasure.
+  new CallInst(CopyFun, args.begin(), args.end(), "", M);
+
+  // Drop the dependence information recorded for the memcpy being removed.
+  MD.removeInstruction(M);
+  toErase.push_back(M);
+
+  return true;
+}
+
 /// processInstruction - When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets
 bool GVN::processInstruction(Instruction* I,
@@ -1025,6 +1105,8 @@ bool GVN::processInstruction(Instruction* I,
                                 SmallVector<Instruction*, 4>& toErase) {
   if (LoadInst* L = dyn_cast<LoadInst>(I)) {
     return processLoad(L, lastSeenLoad, toErase);
+  } else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
+    return processMemCpy(M, toErase);
   }
   
   unsigned num = VN.lookup_or_add(I);
diff --git a/test/Transforms/GVN/memcpy.ll b/test/Transforms/GVN/memcpy.ll
new file mode 100644
index 00000000000..a91e12d2ccd
--- /dev/null
+++ b/test/Transforms/GVN/memcpy.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | opt -gvn -dse | llvm-dis | not grep {i8* %agg.result21, i8* %tmp219}
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* sret  %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind  {
+entry:
+	%tmp2 = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=1]
+	%memtmp = alloca { x86_fp80, x86_fp80 }, align 16		; <{ x86_fp80, x86_fp80 }*> [#uses=2]
+	%tmp5 = sub x86_fp80 0xK80000000000000000000, %z.1		; <x86_fp80> [#uses=1]
+	call void @ccoshl( { x86_fp80, x86_fp80 }* sret  %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0 ) nounwind 
+	%tmp219 = bitcast { x86_fp80, x86_fp80 }* %tmp2 to i8*		; <i8*> [#uses=2]
+	%memtmp20 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8*		; <i8*> [#uses=1]
+	call void @llvm.memcpy.i32( i8* %tmp219, i8* %memtmp20, i32 32, i32 16 )		; memcpy A: copies %memtmp into %tmp2
+	%agg.result21 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8*		; <i8*> [#uses=1]
+	call void @llvm.memcpy.i32( i8* %agg.result21, i8* %tmp219, i32 32, i32 16 )		; memcpy B: copies %tmp2 into %agg.result; the RUN line checks that its source no longer is %tmp219 after -gvn
+	ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* sret , x86_fp80, x86_fp80) nounwind 
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind