diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 39584942468..9bb54a826bd 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -14,14 +14,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-codegen"
+#define DEBUG_TYPE "x86-vzeroupper"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
@@ -41,6 +43,60 @@ namespace {
   private:
     const TargetInstrInfo *TII; // Machine instruction info.
     MachineBasicBlock *MBB;     // Current basic block
+
+    // Any YMM register live-in to this function?
+    bool FnHasLiveInYmm;
+
+    // BBState - Contains the state of each MBB: unknown, clean, dirty
+    SmallVector<unsigned, 8> BBState;
+
+    // BBSolved - Keeps track of all MBBs which have already been analyzed
+    // and need no further processing.
+    BitVector BBSolved;
+
+    // Machine Basic Blocks are classified according to this pass:
+
+    //  ST_UNKNOWN - The MBB state is unknown, meaning that from the entry
+    //  state until the MBB exit no instruction using YMM changes the state
+    //  to dirty, at least one incoming predecessor is unknown, and no
+    //  incoming predecessor is dirty.
+
+    //  ST_CLEAN - No YMM usage at the end of the MBB. An MBB may contain
+    //  instructions using YMM and still be marked ST_CLEAN, as long as the
+    //  state is cleaned up by a vzeroupper before any call.
+
+    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
+    //  vzeroupper instruction.
+
+    //  ST_INIT - Placeholder for an empty state set
+
+    enum {
+      ST_UNKNOWN = 0,
+      ST_CLEAN = 1,
+      ST_DIRTY = 2,
+      ST_INIT = 3
+    };
+
+    // computeState - Given two states, compute the resulting state, in
+    // the following way:
+
+    //  1) One dirty state yields another dirty state
+    //  2) All states must be clean for the result to be clean
+    //  3) If none of the above applies and either state is unknown,
+    //     the result state is also unknown
+
+    unsigned computeState(unsigned PrevState, unsigned CurState) {
+      if (PrevState == ST_INIT)
+        return CurState;
+
+      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
+        return ST_DIRTY;
+
+      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
+        return ST_CLEAN;
+
+      return ST_UNKNOWN;
+    }
+
   };
   char VZeroUpperInserter::ID = 0;
 }
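
The merge performed by computeState behaves like a meet over a small lattice: ST_DIRTY absorbs every other state, ST_CLEAN survives only when all inputs are clean, and ST_INIT is the identity element that seeds the fold over predecessors. Below is a minimal standalone sketch of those three rules (illustrative only, not part of the patch):

#include <cassert>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned computeState(unsigned PrevState, unsigned CurState) {
  if (PrevState == ST_INIT)
    return CurState;   // First input seen: adopt its state.
  if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
    return ST_DIRTY;   // Rule 1: any dirty input wins.
  if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
    return ST_CLEAN;   // Rule 2: clean only if all inputs are clean.
  return ST_UNKNOWN;   // Rule 3: any other mix is unknown.
}

int main() {
  assert(computeState(ST_INIT, ST_CLEAN) == ST_CLEAN);      // identity element
  assert(computeState(ST_UNKNOWN, ST_DIRTY) == ST_DIRTY);   // dirty absorbs
  assert(computeState(ST_CLEAN, ST_UNKNOWN) == ST_UNKNOWN); // mixed is unknown
  return 0;
}
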
@@ -49,37 +105,82 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
   return new VZeroUpperInserter();
 }
 
+static bool isYmmReg(unsigned Reg) {
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
+    return true;
+
+  return false;
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getTarget().getInstrInfo();
-  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool EverMadeChange = false;
 
-  // Process any unreachable blocks in arbitrary order now.
-  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
-    Changed |= processBasicBlock(MF, *BB);
-
-  return Changed;
-}
-
-static bool isCallToModuleFn(const MachineInstr *MI) {
-  assert(MI->getDesc().isCall() && "Isn't a call instruction");
-
-  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-
-    if (!MO.isGlobal())
-      continue;
-
-    const GlobalValue *GV = MO.getGlobal();
-    GlobalValue::LinkageTypes LT = GV->getLinkage();
-    if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
-        (GV->isExternalLinkage(LT) && !GV->isDeclaration()))
-      return true;
-
-    return false;
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions. This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  TargetRegisterClass *RC = X86::VR256RegisterClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (MRI.isPhysRegUsed(*i)) {
+      YMMUsed = true;
+      break;
+    }
   }
-  return false;
+  if (!YMMUsed)
+    return EverMadeChange;
+
+  // Pre-compute the existence of any live-in YMM registers to this function
+  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+  assert(BBState.empty());
+  BBState.resize(MF.getNumBlockIDs(), 0);
+  BBSolved.resize(MF.getNumBlockIDs(), 0);
+
+  // Each BB state depends on its predecessors, so loop until everything
+  // converges. (Once we converge, we can implicitly mark everything that is
+  // still ST_UNKNOWN as ST_CLEAN.)
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      MadeChange |= processBasicBlock(MF, *I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+
+  BBState.clear();
+  BBSolved.clear();
+  return EverMadeChange;
 }
 
 /// processBasicBlock - Loop over all of the instructions in the basic block,
@@ -87,19 +188,98 @@ static bool isCallToModuleFn(const MachineInstr *MI) {
 /// inserting vzero upper instructions before function calls.
 bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
                                            MachineBasicBlock &BB) {
   bool Changed = false;
+  unsigned BBNum = BB.getNumber();
   MBB = &BB;
 
+  // Don't process already solved BBs
+  if (BBSolved[BBNum])
+    return false; // No changes
+
+  // Check the state of all predecessors
+  unsigned EntryState = ST_INIT;
+  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
+       PE = BB.pred_end(); PI != PE; ++PI) {
+    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
+    if (EntryState == ST_DIRTY)
+      break;
+  }
+
+  // The entry MBB for the function may set the initial state to dirty if
+  // the function receives any YMM incoming arguments
+  if (MBB == MF.begin()) {
+    EntryState = ST_CLEAN;
+    if (FnHasLiveInYmm)
+      EntryState = ST_DIRTY;
+  }
+
+  // The current state is initialized according to the predecessors
+  unsigned CurState = EntryState;
+  bool BBHasCall = false;
+
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
     MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
+    bool isControlFlow = MI->getDesc().isCall() || MI->getDesc().isReturn();
 
-    // Insert a vzeroupper instruction before each control transfer
-    // to functions outside this module
-    if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
-      BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
-      ++NumVZU;
+    // Shortcut: no need to check regular instructions in dirty state.
+    if (!isControlFlow && CurState == ST_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = ST_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!isControlFlow)
+      continue;
+
+    BBHasCall = true;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+    // registers. This instruction has zero latency. In addition, the
+    // processor changes back to Clean state, after which execution of Intel
+    // SSE instructions or Intel AVX instructions has no transition penalty.
+    // Add the VZEROUPPER instruction before any function call/return that
+    // might execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == ST_DIRTY) {
+      // Only insert the VZEROUPPER if the entry state isn't unknown. When
+      // unknown, only compute the information within the block to have it
+      // available at the exit if possible, but don't change the block.
+      if (EntryState != ST_UNKNOWN) {
+        BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+        ++NumVZU;
+      }
+
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM uses may appear before subsequent calls or even before
+      // the end of the BB.
+      CurState = ST_CLEAN;
    }
   }
 
+  DEBUG(dbgs() << "MBB #" << BBNum
+               << ", current state: " << CurState << '\n');
+
+  // A BB can only be considered solved when we have both done all the
+  // necessary transformations and computed the exit state. This happens in
+  // two cases:
+  //  1) We know the entry state: this immediately implies the exit state and
+  //     all the necessary transformations.
+  //  2) There are no calls, and an instruction inside the block has made the
+  //     exit state known: no transformations are necessary, and we know the
+  //     exit state.
+  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
+    BBSolved[BBNum] = true;
+
+  if (CurState != BBState[BBNum])
+    Changed = true;
+
+  BBState[BBNum] = CurState;
   return Changed;
 }
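
To make the per-block scan concrete, the following standalone model mirrors the decision logic of processBasicBlock (illustrative only; the 'y'/'c' event encoding and the scanBlock helper are assumptions of this sketch, not part of the patch):

#include <cassert>
#include <string>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2 };

// Scan a block given its entry state and a stream of events: 'y' is a
// YMM-using instruction, 'c' is a call or return. Returns the exit state
// and counts how many vzerouppers the pass would emit.
static unsigned scanBlock(unsigned EntryState, const std::string &Events,
                          unsigned &NumVZU) {
  unsigned CurState = EntryState;
  NumVZU = 0;
  for (size_t I = 0; I != Events.size(); ++I) {
    if (Events[I] == 'y') {
      CurState = ST_DIRTY;          // Any YMM use dirties the state.
    } else if (Events[I] == 'c' && CurState == ST_DIRTY) {
      if (EntryState != ST_UNKNOWN) // Only touch the block once the entry
        ++NumVZU;                   // state is known, as in the pass above.
      CurState = ST_CLEAN;          // State is clean past this call.
    }
  }
  return CurState;
}

int main() {
  unsigned NumVZU;
  // Dirty on entry, two calls: one vzeroupper before the first call suffices.
  unsigned Exit = scanBlock(ST_DIRTY, "cc", NumVZU);
  assert(Exit == ST_CLEAN && NumVZU == 1);
  // Unknown entry: the exit state is still computed, but nothing is emitted.
  Exit = scanBlock(ST_UNKNOWN, "yc", NumVZU);
  assert(Exit == ST_CLEAN && NumVZU == 0);
  (void)Exit;
  return 0;
}
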
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index eaf236c6c77..bf4ab5be151 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,26 +1,83 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
-define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %add.i = fadd <4 x float> %a, %a
-  ret <4 x float> %add.i
-}
+declare <4 x float> @do_sse(<4 x float>)
+declare <8 x float> @do_avx(<8 x float>)
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+@x = common global <4 x float> zeroinitializer, align 16
+@g = common global <8 x float> zeroinitializer, align 32
+
+;; Basic checking - don't emit any vzeroupper instruction
 
 ; CHECK: _test00
 define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
 entry:
-  %add.i = fadd <4 x float> %a, %b
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: callq _do_sse
-  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind
-  %sub.i = fsub <4 x float> %call3, %add.i
   ; CHECK-NOT: vzeroupper
-  ; CHECK: callq _do_sse_local
-  %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i)
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: jmp _do_sse
-  %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind
-  ret <4 x float> %call10
+  %add.i = fadd <4 x float> %a, %b
+  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
+  ; CHECK: ret
+  ret <4 x float> %call3
 }
 
-declare <4 x float> @do_sse(<4 x float>)
+;; Check 256-bit parameter passing
+
+; CHECK: _test01
+define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
+entry:
+  %tmp = load <4 x float>* @x, align 16
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
+  store <4 x float> %call, <4 x float>* @x, align 16
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: callq _do_sse
+  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
+  store <4 x float> %call2, <4 x float>* @x, align 16
+  ; CHECK: ret
+  ret <8 x float> %c
+}
+
+;; Check that the pass converges and that vzeroupper is only issued when
+;; necessary; for this function it should happen only once
+
+; CHECK: _test02
+define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %b
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  ; CHECK: LBB
+  ; CHECK-NOT: vzeroupper
+  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
+  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+  ; CHECK: callq _do_sse
+  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
+  ; CHECK-NEXT: callq _do_sse
+  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
+  %tmp11 = load <8 x float>* @g, align 32
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
+  %1 = add nsw i32 %i.018, 1
+  %exitcond = icmp eq i32 %1, 4
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret <4 x float> %call14
+}
+
+;; Check that we also perform vzeroupper when we return from a function.
+
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: call
+  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
+  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: vzeroupper
+  ; CHECK: ret
+  ret <4 x float> %shuf2
+}
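
The loop in test02 also illustrates why runOnMachineFunction iterates to a fixed point: on the first sweep the loop block merges the still-unknown state from its own back edge, and a second sweep is needed to confirm that no exit state changes. A toy model of that convergence (illustrative only; the three-block entry/loop/exit CFG mirrors the shape of test02 and is an assumption of this sketch):

#include <cstdio>
#include <vector>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned computeState(unsigned Prev, unsigned Cur) {
  if (Prev == ST_INIT)
    return Cur;
  if (Prev == ST_DIRTY || Cur == ST_DIRTY)
    return ST_DIRTY;
  if (Prev == ST_CLEAN && Cur == ST_CLEAN)
    return ST_CLEAN;
  return ST_UNKNOWN;
}

int main() {
  // Block 0 is the entry, block 1 is a self-loop that uses YMM, and block 2
  // is the exit.
  std::vector<std::vector<int> > Preds(3);
  Preds[1].push_back(0);
  Preds[1].push_back(1); // back edge
  Preds[2].push_back(1);

  std::vector<unsigned> BBState(3, ST_UNKNOWN);
  unsigned Iterations = 0;
  for (bool MadeChange = true; MadeChange; ++Iterations) {
    MadeChange = false;
    for (int B = 0; B < 3; ++B) {
      // Merge the exit states of all predecessors, as the pass does.
      unsigned Entry = ST_INIT;
      for (size_t P = 0; P < Preds[B].size(); ++P)
        Entry = computeState(Entry, BBState[Preds[B][P]]);
      if (B == 0)
        Entry = ST_CLEAN;                            // Entry, no live-in YMM.
      unsigned Exit = (B == 1) ? ST_DIRTY : Entry;   // Block 1 uses YMM.
      if (Exit != BBState[B]) {
        BBState[B] = Exit;
        MadeChange = true;
      }
    }
  }
  // One sweep that makes changes plus a second that confirms the fixed
  // point; prints "converged after 2 iterations: 1 2 2".
  std::printf("converged after %u iterations: %u %u %u\n",
              Iterations, BBState[0], BBState[1], BBState[2]);
  return 0;
}
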