diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e97d09aeb2a..7a9a4d77d2d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5448,6 +5448,18 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
   return LoadVal;
 }
 
+/// processIntegerCallValue - Record the value for an instruction that
+/// produces an integer result, converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+                                                  SDValue Value,
+                                                  bool IsSigned) {
+  EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true);
+  if (IsSigned)
+    Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
+  else
+    Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
+  setValue(&I, Value);
+}
 
 /// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
 /// If so, return true and lower it, otherwise return false and it will be
@@ -5463,35 +5475,33 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       !I.getType()->isIntegerTy())
     return false;
 
-  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
-  if (Size && Size->getZExtValue() == 0) {
+  const Value *Size = I.getArgOperand(2);
+  const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
+  if (CSize && CSize->getZExtValue() == 0) {
     EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
     setValue(&I, DAG.getConstant(0, CallVT));
     return true;
   }
 
-  const Value *Arg0 = I.getArgOperand(0);
-  const Value *Arg1 = I.getArgOperand(1);
-  const Value *Arg2 = I.getArgOperand(2);
   const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
     TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
-                                getValue(Arg0), getValue(Arg1), getValue(Arg2),
-                                MachinePointerInfo(Arg0),
-                                MachinePointerInfo(Arg1));
+                                getValue(LHS), getValue(RHS), getValue(Size),
+                                MachinePointerInfo(LHS),
+                                MachinePointerInfo(RHS));
   if (Res.first.getNode()) {
-    setValue(&I, Res.first);
-    DAG.setRoot(Res.second);
+    processIntegerCallValue(I, Res.first, true);
+    PendingLoads.push_back(Res.second);
     return true;
   }
 
   // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
   // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
-  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+  if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
     bool ActuallyDoIt = true;
     MVT LoadVT;
     Type *LoadTy;
-    switch (Size->getZExtValue()) {
+    switch (CSize->getZExtValue()) {
     default:
       LoadVT = MVT::Other;
       LoadTy = 0;
@@ -5499,20 +5509,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       break;
     case 2:
       LoadVT = MVT::i16;
-      LoadTy = Type::getInt16Ty(Size->getContext());
+      LoadTy = Type::getInt16Ty(CSize->getContext());
       break;
     case 4:
       LoadVT = MVT::i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       break;
     case 8:
       LoadVT = MVT::i64;
-      LoadTy = Type::getInt64Ty(Size->getContext());
+      LoadTy = Type::getInt64Ty(CSize->getContext());
       break;
         /*
     case 16:
       LoadVT = MVT::v4i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       LoadTy = VectorType::get(LoadTy, 4);
       break;
          */
@@ -5526,7 +5536,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
     // supports unaligned loads of that type.  Expanding into byte loads would
     // bloat the code.
     const TargetLowering *TLI = TM.getTargetLowering();
-    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+    if (ActuallyDoIt && CSize->getZExtValue() > 4) {
       // TODO: Handle 5 byte compare as 4-byte + 1 byte.
       // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
       if (!TLI->isTypeLegal(LoadVT) ||!TLI->allowsUnalignedMemoryAccesses(LoadVT))
@@ -5539,8 +5549,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
 
       SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
                                  ISD::SETNE);
-      EVT CallVT = TLI->getValueType(I.getType(), true);
-      setValue(&I, DAG.getZExtOrTrunc(Res, getCurSDLoc(), CallVT));
+      processIntegerCallValue(I, Res, false);
       return true;
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 82ac0ed69e2..217d2f9e6ac 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -543,6 +543,9 @@ private:
     llvm_unreachable("UserOp2 should not exist at instruction selection time!");
   }
 
+  void processIntegerCallValue(const Instruction &I,
+                               SDValue Value, bool IsSigned);
+
   void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
 
   /// EmitFuncArgumentDbgValue - If V is an function argument then create
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index f2e877fd29b..a84ce28c514 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -339,7 +339,12 @@ static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
 static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
                                   const MachineRegisterInfo *MRI,
                                   const TargetRegisterInfo *TRI) {
+  MachineInstr *LGFR = 0;
   MachineInstr *RLL = getDef(SrcReg, MRI);
+  if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+    LGFR = RLL;
+    RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+  }
   if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
     return false;
 
@@ -362,6 +367,8 @@ static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
   }
 
   Compare->eraseFromParent();
+  if (LGFR)
+    eraseIfDead(LGFR, MRI);
   eraseIfDead(RLL, MRI);
   eraseIfDead(SRL, MRI);
   eraseIfDead(IPM, MRI);
diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll
index 4d8cd14880a..5f5752b336d 100644
--- a/test/CodeGen/SystemZ/memcmp-01.ll
+++ b/test/CodeGen/SystemZ/memcmp-01.ll
@@ -1,4 +1,4 @@
-; Test memcmp using CLC.
+; Test memcmp using CLC, with i32 results.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll
new file mode 100644
index 00000000000..cae3d3d4943
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-02.ll
@@ -0,0 +1,135 @@
+; Test memcmp using CLC, with i64 results.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i64 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+  ret i64 %res
+}
+
+; Check a case where the result is used as an integer.
+define i64 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+  ret i64 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+  %cmp = icmp eq i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+  %cmp = icmp ne i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+  %cmp = icmp slt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+  %cmp = icmp sgt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the upper end of the CLC range.  Here the result is used both as
+; an integer and for branching.
+define i64 @f7(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+  %cmp = icmp slt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret i64 %res
+}
+
+; 257 bytes is too big for a single CLC.  For now expect a call instead.
+define i64 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, memcmp@PLT
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+  ret i64 %res
+}