diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8b102e4fbb9..c6f73baeb2a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7182,21 +7182,6 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   return true;
 }
 
-/// \brief Check wether all of one set of inputs to a shuffle mask are in place.
-///
-/// Mask entries pointing at the other input or undef will be skipped.
-static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    int M = Mask[i];
-    if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4))
-      continue;
-    if (M - (LoInput ? 0 : Size) != i)
-      return false;
-  }
-  return true;
-}
-
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7385,13 +7370,48 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // INSERTPS when the V1 elements are already in the correct locations
     // because otherwise we can just always use two SHUFPS instructions which
     // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41() &&
-        isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
-      // Insert the V2 element into the desired position.
-      SDValue InsertPSMask =
-          DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
-      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                         InsertPSMask);
+    if (Subtarget->hasSSE41()) {
+      // When using INSERTPS we can zero any lane of the destination. Collect
+      // the zero inputs into a mask and drop them from the lanes of V1 which
+      // actually need to be present as inputs to the INSERTPS.
+      unsigned ZMask = 0;
+      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+        ZMask = 0xF ^ (1 << V2Index);
+      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        for (int i = 0; i < 4; ++i) {
+          int M = Mask[i];
+          if (M >= 4)
+            continue;
+          if (M > -1) {
+            SDValue Input = V1.getOperand(M);
+            if (Input.getOpcode() != ISD::UNDEF &&
+                !X86::isZeroNode(Input)) {
+              // A non-zero input!
+              ZMask = 0;
+              break;
+            }
+          }
+          ZMask |= 1 << i;
+        }
+      }
+
+      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+      int InsertShuffleMask[4] = {-1, -1, -1, -1};
+      for (int i = 0; i < 4; ++i)
+        if (i != V2Index && (ZMask & (1 << i)) == 0)
+          InsertShuffleMask[i] = Mask[i];
+
+      if (isNoopShuffleMask(InsertShuffleMask)) {
+        // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+        if ((ZMask | 1 << V2Index) == 0xF)
+          V1 = DAG.getUNDEF(MVT::v4f32);
+
+        // Insert the V2 element into the desired position.
+        SDValue InsertPSMask =
+            DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                           InsertPSMask);
+      }
     }
 
     // Compute the index adjacent to V2Index and in the same half by toggling
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0c43e0e9d27..7f448835b5d 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -207,3 +207,113 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_4zzz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_4zzz
+; SSE41:         insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_4zzz
+; AVX1:         vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z4zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z4zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z4zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zz4z
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zz4z
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zz4z
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zuu4
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zuu4
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zuu4
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zzz7
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zzz7
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zzz7
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z6zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z6zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z6zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}