diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 7c2845a9d07..de49d184131 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -582,7 +582,7 @@ public:
   /// getVAArg - VAArg produces a result and token chain, and takes a pointer
   /// and a source value as input.
   SDValue getVAArg(EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr,
-                   SDValue SV, unsigned Align = 0);
+                   SDValue SV, unsigned Align);
 
   /// getAtomic - Gets a node for an atomic op, produces result and chain and
   /// takes 3 operands
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 47aa6d1683a..30aadf6bdf8 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -686,6 +686,12 @@ public:
     return JumpBufAlignment;
   }
 
+  /// getMinStackArgumentAlignment - return the minimum stack alignment of an
+  /// argument.
+  unsigned getMinStackArgumentAlignment() const {
+    return MinStackArgumentAlignment;
+  }
+
   /// getPrefLoopAlignment - return the preferred loop alignment.
   ///
   unsigned getPrefLoopAlignment() const {
@@ -1082,6 +1088,12 @@ protected:
     PrefLoopAlignment = Align;
   }
 
+  /// setMinStackArgumentAlignment - Set the minimum stack alignment of an
+  /// argument.
+  void setMinStackArgumentAlignment(unsigned Align) {
+    MinStackArgumentAlignment = Align;
+  }
+
   /// setShouldFoldAtomicFences - Set if the target's implementation of the
   /// atomic operation intrinsics includes locking. Default is false.
   void setShouldFoldAtomicFences(bool fold) {
@@ -1515,6 +1527,11 @@ private:
   /// buffers
   unsigned JumpBufAlignment;
 
+  /// MinStackArgumentAlignment - The minimum alginment that any argument
+  /// on the stack needs to have.
+  ///
+  unsigned MinStackArgumentAlignment;
+
   /// PrefLoopAlignment - The perferred loop alignment.
   ///
   unsigned PrefLoopAlignment;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 87220a50b92..7a47da4ec52 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2658,7 +2658,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
                                      false, false, 0);
     SDValue VAList = VAListLoad;
 
-    if (Align != 0 ) {
+    if (Align > TLI.getMinStackArgumentAlignment()) {
+      assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
       VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
                            DAG.getConstant(Align - 1,
                                            TLI.getPointerTy()));
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 79e3dec28eb..68bcebc7dd3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -504,7 +504,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
 
   SDValue NewVAARG;
-  NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2));
+  NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
+			  N->getConstantOperandVal(3));
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e4b42064fc2..b94ea9a3a9a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -572,7 +572,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
 
   SmallVector<SDValue, 8> Parts(NumRegs);
   for (unsigned i = 0; i < NumRegs; ++i) {
-    Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2));
+    Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2),
+                            N->getConstantOperandVal(3));
     Chain = Parts[i].getValue(1);
   }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 71adb336170..9c2b1d9ed73 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -243,13 +243,10 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
   SDValue Chain = N->getOperand(0);
   SDValue Ptr = N->getOperand(1);
   DebugLoc dl = N->getDebugLoc();
-  const unsigned OldAlign = N->getConstantOperandVal(3);
-  const Type *Type = OVT.getTypeForEVT(*DAG.getContext());
-  const unsigned TypeAlign = TLI.getTargetData()->getABITypeAlignment(Type);
-  const unsigned Align = std::max(OldAlign, TypeAlign);
+  const unsigned Align = N->getConstantOperandVal(3);
 
   Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
-  Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2));
+  Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
 
   // Handle endianness of the load.
   if (TLI.isBigEndian())
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3b3ee3e3434..ea41ec63a52 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5672,7 +5672,8 @@ void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
 void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
   SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(),
                            getRoot(), getValue(I.getOperand(0)),
-                           DAG.getSrcValue(I.getOperand(0)));
+                           DAG.getSrcValue(I.getOperand(0)),
+                           TLI.getTargetData()->getABITypeAlignment(I.getType()));
   setValue(&I, V);
   DAG.setRoot(V.getValue(1));
 }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a9a7e5054b7..4f3866956ca 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -580,6 +580,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
   JumpBufSize = 0;
   JumpBufAlignment = 0;
   PrefLoopAlignment = 0;
+  MinStackArgumentAlignment = 1;
   ShouldFoldAtomicFences = false;
 
   InitLibcallNames(LibcallRoutineNames);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e1e5a80678b..e60a2b14f1f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -539,6 +539,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 
   maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
 
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
   if (EnableARMCodePlacement)
     benefitFromCodePlacementOpt = true;
 }
diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll
index a54a499cf19..7cb976236dc 100644
--- a/test/CodeGen/ARM/va_arg.ll
+++ b/test/CodeGen/ARM/va_arg.ll
@@ -1,10 +1,13 @@
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
 ; Test that we correctly align elements when using va_arg
 
+; CHECK: test1:
+; CHECK-NOT: bfc
 ; CHECK: add	r0, r0, #7
 ; CHECK: bfc	r0, #0, #3
+; CHECK-NOT: bfc
 
-define i64 @f8(i32 %i, ...) nounwind optsize {
+define i64 @test1(i32 %i, ...) nounwind optsize {
 entry:
   %g = alloca i8*, align 4
   %g1 = bitcast i8** %g to i8*
@@ -14,6 +17,25 @@ entry:
   ret i64 %0
 }
 
+; CHECK: test2:
+; CHECK-NOT: bfc
+; CHECK: add	r0, r0, #7
+; CHECK: bfc	r0, #0, #3
+; CHECK-NOT:	bfc
+; CHECK: bx	lr
+
+define double @test2(i32 %a, i32 %b, ...) nounwind optsize {
+entry:
+  %ap = alloca i8*, align 4                       ; <i8**> [#uses=3]
+  %ap1 = bitcast i8** %ap to i8*                  ; <i8*> [#uses=2]
+  call void @llvm.va_start(i8* %ap1)
+  %0 = va_arg i8** %ap, i32                       ; <i32> [#uses=0]
+  %1 = va_arg i8** %ap, double                    ; <double> [#uses=1]
+  call void @llvm.va_end(i8* %ap1)
+  ret double %1
+}
+
+
 declare void @llvm.va_start(i8*) nounwind
 
 declare void @llvm.va_end(i8*) nounwind