diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index f987a15cee4..eb01c5ef318 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -594,6 +594,15 @@ LowerFormalArguments_64(SDValue Chain,
 SDValue
 SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerCall_64(CLI, InVals);
+  return LowerCall_32(CLI, InVals);
+}
+
+// Lower a call for the 32-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
   DebugLoc &dl = CLI.DL;
   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
@@ -887,6 +896,221 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
   return getDataLayout()->getTypeAllocSize(ElementTy);
 }
 
+// Lower a call for the 64-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  DebugLoc DL = CLI.DL;
+  SDValue Chain = CLI.Chain;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  // The stack offset computed by CC_Sparc64 includes all arguments.
+  // We always allocate space for 6 arguments in the prolog.
+  unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset()) - 6*8u;
+
+  // Keep stack frames 16-byte aligned.
+  ArgsSize = RoundUpToAlignment(ArgsSize, 16);
+
+  // Adjust the stack pointer to make room for the arguments.
+  // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
+  // with more than 6 arguments.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+
+  // Collect the set of registers to pass to the function and their values.
+  // This will be emitted as a sequence of CopyToReg nodes glued to the call
+  // instruction.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+  // Collect chains from all the memory operations that copy arguments to the
+  // stack. They must follow the stack pointer adjustment above and precede the
+  // call instruction itself.
+  SmallVector<SDValue, 8> MemOpChains;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = CLI.OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown location info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      // The custom bit on an i32 argument indicates that it should be
+      // passed in the high bits of the register.
+      if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+        Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
+                          DAG.getConstant(32, MVT::i32));
+
+        // The next value may go in the low bits of the same register.
+        // Handle both at once.
+        if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
+            ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
+          SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
+                                   CLI.OutVals[i+1]);
+          Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
+          // Skip the next value, it's already done.
+          ++i;
+        }
+      }
+
+      // The argument registers are described in terms of the callee's register
+      // window, so translate I0-I7 -> O0-O7.
+      unsigned Reg = VA.getLocReg();
+      if (Reg >= SP::I0 && Reg <= SP::I7)
+        Reg = Reg - SP::I0 + SP::O0;
+      RegsToPass.push_back(std::make_pair(Reg, Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+    // The argument area starts at %fp+BIAS+128 in the callee frame,
+    // %sp+BIAS+128 in ours.
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
+                                           Subtarget->getStackPointerBias() +
+                                           128);
+    PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+    MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+  }
+
+  // Emit all stores, make sure they occur before the call.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of CopyToReg nodes glued together with token chain and
+  // glue operands which copy the outgoing args into registers. The InGlue is
+  // necessary since all emitted instructions must be stuck together in order
+  // to pass the live physical registers.
+  SDValue InGlue;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL,
+                             RegsToPass[i].first, RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  SDValue Callee = CLI.Callee;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy());
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy());
+
+  // Build the operands for the call instruction itself.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Make sure the CopyToReg nodes are glued to the call instruction which
+  // consumes the registers.
+  if (InGlue.getNode())
+    Ops.push_back(InGlue);
+
+  // Now the call itself.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+  InGlue = Chain.getValue(1);
+
+  // Revert the stack pointer immediately after the call.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InGlue);
+  InGlue = Chain.getValue(1);
+
+  // Now extract the return values. This is more or less the same as
+  // LowerFormalArguments_64.
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  RVInfo.AnalyzeCallResult(CLI.Ins, CC_Sparc64);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    unsigned Reg = VA.getLocReg();
+
+    // Remap I0-I7 -> O0-O7.
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg - SP::I0 + SP::O0;
+
+    // When returning 'inreg {i32, i32}', two consecutive i32 arguments can
+    // reside in the same register in the high and low bits. Reuse the
+    // previous CopyFromReg node to avoid duplicate copies.
+    SDValue RV;
+    if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
+      if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
+        RV = Chain.getValue(0);
+
+    // But usually we'll create a new CopyFromReg for a different register.
+    if (!RV.getNode()) {
+      RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
+      Chain = RV.getValue(1);
+      InGlue = Chain.getValue(2);
+    }
+
+    // Get the high bits for i32 struct elements.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+      RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
+                       DAG.getConstant(32, MVT::i32));
+
+    // The callee promoted the return value, so insert an Assert?ext SDNode so
+    // we won't promote the value again in this function.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::SExt:
+      RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    case CCValAssign::ZExt:
+      RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    default:
+      break;
+    }
+
+    // Truncate the register down to the return value type.
+    if (VA.isExtInLoc())
+      RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
+
+    InVals.push_back(RV);
+  }
+
+  return Chain;
+}
+
 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 189a3882d3c..8a50f6890a0 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -95,6 +95,10 @@ namespace llvm {
     virtual SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
                 SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
 
     virtual SDValue
       LowerReturn(SDValue Chain,
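
A note on the ArgsSize computation in LowerCall_64 above: CC_Sparc64 reports stack offsets that already include the six 8-byte argument slots the prologue reserves, so the caller only allocates the excess, rounded up to keep %sp 16-byte aligned. A minimal standalone sketch of that arithmetic (roundUpToAlignment is reimplemented here for illustration; the real RoundUpToAlignment helper lives in llvm/Support/MathExtras.h):

    #include <algorithm>
    #include <cassert>

    // Simplified stand-in for LLVM's RoundUpToAlignment.
    static unsigned roundUpToAlignment(unsigned Value, unsigned Align) {
      return (Value + Align - 1) / Align * Align;
    }

    // Mirrors the LowerCall_64 computation: the first six 8-byte slots are
    // already covered by the fixed area the prologue reserves.
    static unsigned callFrameSize(unsigned NextStackOffset) {
      unsigned ArgsSize = std::max(6 * 8u, NextStackOffset) - 6 * 8u;
      return roundUpToAlignment(ArgsSize, 16); // keep frames 16-byte aligned
    }

    int main() {
      assert(callFrameSize(48) == 0);  // up to 6 slots: no extra stack
      assert(callFrameSize(56) == 16); // 7th slot: 8 bytes, rounded to 16
      assert(callFrameSize(80) == 32); // 10 slots: 32 extra bytes
      return 0;
    }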
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index d447ec76d84..10d8ff7c9ae 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=sparcv9 | FileCheck %s
+; RUN: llc < %s -march=sparcv9 -disable-sparc-delay-filler | FileCheck %s
 
 ; CHECK: intarg
 ; CHECK: stb %i0, [%i4]
@@ -17,7 +17,7 @@ define void @intarg(i8 %a0,    ; %i0
                     i32 %a3,   ; %i3
                     i8* %a4,   ; %i4
                     i32 %a5,   ; %i5
-                    i32 %a6,   ; [%fp+BIAS+176]
+                    i32 signext %a6, ; [%fp+BIAS+176]
                     i8* %a7) { ; [%fp+BIAS+184]
   store i8 %a0, i8* %a4
   store i8 %a1, i8* %a4
@@ -33,6 +33,18 @@ define void @intarg(i8 %a0,    ; %i0
   ret void
 }
 
+; CHECK: call_intarg
+; Sign-extend and store the full 64 bits.
+; CHECK: sra %i0, 0, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%sp+2223]
+; Use %o0-%o5 for outgoing arguments
+; CHECK: or %g0, 5, %o5
+; CHECK: call intarg
+define void @call_intarg(i32 %i0, i8* %i1) {
+  call void @intarg(i8 0, i8 1, i16 2, i32 3, i8* undef, i32 5, i32 %i0, i8* %i1)
+  ret void
+}
+
 ; CHECK: floatarg
 ; CHECK: fstod %f1,
 ; CHECK: faddd %f2,
@@ -57,7 +69,7 @@ define double @floatarg(float %a0,   ; %f1
                         float %a14,  ; %f29
                         float %a15,  ; %f31
                         float %a16,  ; [%fp+BIAS+256] (using 8 bytes)
-                        float %a17) { ; [%fp+BIAS+264] (using 8 bytes)
+                        double %a17) { ; [%fp+BIAS+264] (using 8 bytes)
   %d0 = fpext float %a0 to double
   %s1 = fadd double %a1, %d0
   %s2 = fadd double %a2, %s1
@@ -68,6 +80,23 @@ define double @floatarg(float %a0,   ; %f1
   ret double %s17
 }
 
+; CHECK: call_floatarg
+; Store 4 bytes, right-aligned in slot.
+; CHECK: st %f1, [%sp+2307]
+; Store 8 bytes in full slot.
+; CHECK: std %f2, [%sp+2311]
+; CHECK: fmovd %f2, %f4
+; CHECK: call floatarg
+define void @call_floatarg(float %f1, double %d2, float %f5, double *%p) {
+  %r = call double @floatarg(float %f5, double %d2, double %d2, double %d2,
+                             float %f5, float %f5, float %f5, float %f5,
+                             float %f5, float %f5, float %f5, float %f5,
+                             float %f5, float %f5, float %f5, float %f5,
+                             float %f1, double %d2)
+  store double %r, double* %p
+  ret void
+}
+
 ; CHECK: mixedarg
 ; CHECK: fstod %f3
 ; CHECK: faddd %f6
@@ -92,6 +121,26 @@ define void @mixedarg(i8 %a0,      ; %i0
   ret void
 }
 
+; CHECK: call_mixedarg
+; CHECK: stx %i2, [%sp+2247]
+; CHECK: stx %i0, [%sp+2223]
+; CHECK: fmovd %f2, %f6
+; CHECK: fmovd %f2, %f16
+; CHECK: call mixedarg
+define void @call_mixedarg(i64 %i0, double %f2, i16* %i2) {
+  call void @mixedarg(i8 undef,
+                      float undef,
+                      i16 undef,
+                      double %f2,
+                      i13 undef,
+                      float undef,
+                      i64 %i0,
+                      double* undef,
+                      double %f2,
+                      i16* %i2)
+  ret void
+}
+
 ; The inreg attribute is used to indicate 32-bit sized struct elements that
 ; share an 8-byte slot.
 ; CHECK: inreg_fi
@@ -105,6 +154,15 @@ define i32 @inreg_fi(i32 inreg %a0,   ; high bits of %i0
   ret i32 %rv
 }
 
+; CHECK: call_inreg_fi
+; CHECK: sllx %i1, 32, %o0
+; CHECK: fmovs %f5, %f1
+; CHECK: call inreg_fi
+define void @call_inreg_fi(i32* %p, i32 %i1, float %f5) {
+  %x = call i32 @inreg_fi(i32 %i1, float %f5)
+  ret void
+}
+
 ; CHECK: inreg_ff
 ; CHECK: fsubs %f0, %f1, %f1
 define float @inreg_ff(float inreg %a0,   ; %f0
@@ -113,6 +171,15 @@ define float @inreg_ff(float inreg %a0,   ; %f0
   ret float %rv
 }
 
+; CHECK: call_inreg_ff
+; CHECK: fmovs %f3, %f0
+; CHECK: fmovs %f5, %f1
+; CHECK: call inreg_ff
+define void @call_inreg_ff(i32* %p, float %f3, float %f5) {
+  %x = call float @inreg_ff(float %f3, float %f5)
+  ret void
+}
+
 ; CHECK: inreg_if
 ; CHECK: fstoi %f0
 ; CHECK: sub %i0
@@ -123,6 +190,15 @@ define i32 @inreg_if(float inreg %a0, ; %f0
   ret i32 %rv
 }
 
+; CHECK: call_inreg_if
+; CHECK: fmovs %f3, %f0
+; CHECK: or %g0, %i2, %o0
+; CHECK: call inreg_if
+define void @call_inreg_if(i32* %p, float %f3, i32 %i2) {
+  %x = call i32 @inreg_if(float %f3, i32 %i2)
+  ret void
+}
+
 ; The frontend shouldn't do this. Just pass i64 instead.
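
The stack offsets in the call_* tests above follow from the v9 frame layout used by LowerCall_64: an outgoing argument's slot lives at %sp + 2047 (the stack pointer bias) + 128 (past the register save area) + 8*slot, and sub-8-byte values are right-aligned within their 8-byte slot. A small sketch that reproduces the offsets the CHECK lines expect (the helper name is illustrative; the bias and argument-area constants come from the code above):

    #include <cassert>

    // SPARC v9 layout constants as used by LowerCall_64.
    const unsigned Bias = 2047;    // Subtarget->getStackPointerBias()
    const unsigned ArgArea = 128;  // argument area starts at %sp+BIAS+128

    // Offset from %sp for 0-based argument slot `Slot`, holding a value of
    // `Size` bytes right-aligned in its 8-byte slot.
    unsigned argSlotOffset(unsigned Slot, unsigned Size) {
      return Bias + ArgArea + Slot * 8 + (8 - Size);
    }

    int main() {
      assert(argSlotOffset(6, 8) == 2223);  // 7th arg: stx ... [%sp+2223]
      assert(argSlotOffset(16, 4) == 2307); // 17th slot float: st %f1, [%sp+2307]
      assert(argSlotOffset(17, 8) == 2311); // 18th slot double: std %f2, [%sp+2311]
      return 0;
    }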
 ; CHECK: inreg_ii
 ; CHECK: srlx %i0, 32, [[R:%[gilo][0-7]]]
@@ -133,6 +209,16 @@ define i32 @inreg_ii(i32 inreg %a0,   ; high bits of %i0
   ret i32 %rv
 }
 
+; CHECK: call_inreg_ii
+; CHECK: srl %i2, 0, [[R2:%[gilo][0-7]]]
+; CHECK: sllx %i1, 32, [[R1:%[gilo][0-7]]]
+; CHECK: or [[R1]], [[R2]], %o0
+; CHECK: call inreg_ii
+define void @call_inreg_ii(i32* %p, i32 %i1, i32 %i2) {
+  %x = call i32 @inreg_ii(i32 %i1, i32 %i2)
+  ret void
+}
+
 ; Structs up to 32 bytes in size can be returned in registers.
 ; CHECK: ret_i64_pair
 ; CHECK: ldx [%i2], %i0
@@ -146,6 +232,20 @@ define { i64, i64 } @ret_i64_pair(i32 %a0, i32 %a1, i64* %p, i64* %q) {
   ret { i64, i64 } %rv2
 }
 
+; CHECK: call_ret_i64_pair
+; CHECK: call ret_i64_pair
+; CHECK: stx %o0, [%i0]
+; CHECK: stx %o1, [%i0]
+define void @call_ret_i64_pair(i64* %i0) {
+  %rv = call { i64, i64 } @ret_i64_pair(i32 undef, i32 undef,
+                                        i64* undef, i64* undef)
+  %e0 = extractvalue { i64, i64 } %rv, 0
+  store i64 %e0, i64* %i0
+  %e1 = extractvalue { i64, i64 } %rv, 1
+  store i64 %e1, i64* %i0
+  ret void
+}
+
 ; This is not a C struct, each member uses 8 bytes.
 ; CHECK: ret_i32_float_pair
 ; CHECK: ld [%i2], %i0
@@ -160,6 +260,20 @@ define { i32, float } @ret_i32_float_pair(i32 %a0, i32 %a1,
   ret { i32, float } %rv2
 }
 
+; CHECK: call_ret_i32_float_pair
+; CHECK: call ret_i32_float_pair
+; CHECK: st %o0, [%i0]
+; CHECK: st %f3, [%i1]
+define void @call_ret_i32_float_pair(i32* %i0, float* %i1) {
+  %rv = call { i32, float } @ret_i32_float_pair(i32 undef, i32 undef,
+                                                i32* undef, float* undef)
+  %e0 = extractvalue { i32, float } %rv, 0
+  store i32 %e0, i32* %i0
+  %e1 = extractvalue { i32, float } %rv, 1
+  store float %e1, float* %i1
+  ret void
+}
+
 ; This is a C struct, each member uses 4 bytes.
 ; CHECK: ret_i32_float_packed
 ; CHECK: ld [%i2], [[R:%[gilo][0-7]]]
@@ -175,6 +289,21 @@ define inreg { i32, float } @ret_i32_float_packed(i32 %a0, i32 %a1,
   ret { i32, float } %rv2
 }
 
+; CHECK: call_ret_i32_float_packed
+; CHECK: call ret_i32_float_packed
+; CHECK: srlx %o0, 32, [[R:%[gilo][0-7]]]
+; CHECK: st [[R]], [%i0]
+; CHECK: st %f1, [%i1]
+define void @call_ret_i32_float_packed(i32* %i0, float* %i1) {
+  %rv = call { i32, float } @ret_i32_float_packed(i32 undef, i32 undef,
+                                                  i32* undef, float* undef)
+  %e0 = extractvalue { i32, float } %rv, 0
+  store i32 %e0, i32* %i0
+  %e1 = extractvalue { i32, float } %rv, 1
+  store float %e1, float* %i1
+  ret void
+}
+
 ; The C frontend should use i64 to return { i32, i32 } structs, but verify that
 ; we don't miscompile this case where both struct elements are placed in %i0.
 ; CHECK: ret_i32_packed
@@ -192,6 +321,21 @@ define inreg { i32, i32 } @ret_i32_packed(i32 %a0, i32 %a1,
   ret { i32, i32 } %rv2
 }
 
+; CHECK: call_ret_i32_packed
+; CHECK: call ret_i32_packed
+; CHECK: srlx %o0, 32, [[R:%[gilo][0-7]]]
+; CHECK: st [[R]], [%i0]
+; CHECK: st %o0, [%i1]
+define void @call_ret_i32_packed(i32* %i0, i32* %i1) {
+  %rv = call { i32, i32 } @ret_i32_packed(i32 undef, i32 undef,
+                                          i32* undef, i32* undef)
+  %e0 = extractvalue { i32, i32 } %rv, 0
+  store i32 %e0, i32* %i0
+  %e1 = extractvalue { i32, i32 } %rv, 1
+  store i32 %e1, i32* %i1
+  ret void
+}
+
 ; The return value must be sign-extended to 64 bits.
 ; CHECK: ret_sext
 ; CHECK: sra %i0, 0, %i0
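
For the packed { i32, i32 } return exercised by call_ret_i32_packed, both elements share a single 64-bit register with the first element in the high word; the caller recovers them with srlx and a plain 32-bit store, mirroring the SHL/OR packing LowerCall_64 performs on the argument side. A bit-level sketch of that convention (helper names are hypothetical):

    #include <cassert>
    #include <cstdint>

    // Pack two i32 struct elements the way the SHL/OR sequence does:
    // first element in the high bits, second zero-extended into the low bits.
    uint64_t packI32Pair(uint32_t First, uint32_t Second) {
      return (uint64_t(First) << 32) | uint64_t(Second);
    }

    // Caller side: 'srlx %o0, 32' yields the first element; a 32-bit store
    // of %o0 keeps only the low word, i.e. the second element.
    uint32_t firstElt(uint64_t Reg)  { return uint32_t(Reg >> 32); }
    uint32_t secondElt(uint64_t Reg) { return uint32_t(Reg); }

    int main() {
      uint64_t Reg = packI32Pair(0xdeadbeef, 42);
      assert(firstElt(Reg) == 0xdeadbeef);
      assert(secondElt(Reg) == 42);
      return 0;
    }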