Handle vector moves / loads that zero the top bits of the destination register (e.g. movd, movq, movss (addr), movsd (addr)) with an X86-specific DAG combine.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@50838 91177308-0d34-0410-b5e6-96231b3b80d8
Evan Cheng 2008-05-08 00:57:18 +00:00
parent 687bcb2be0
commit 7e2ff77ef0
9 changed files with 175 additions and 147 deletions
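
The heart of the change is the new target node X86ISD::ZEXT_VMOVL (X86zvmovl in the .td files): instead of describing "insert a scalar into the low element and zero the rest" as a generic vector_shuffle against an all-zeros build_vector, the lowering now emits this one node and lets the instruction patterns pick movd, movq, movss (addr) or movsd (addr), all of which clear the upper lanes for free. A minimal sketch of that shape, using the 2008-era API spellings visible in this diff (SDOperand, MVT::ValueType); the helper name here is made up for illustration, the real logic lives in getZextVMoveL and getShuffleVectorZeroOrUndef below:

    // Hypothetical helper: lower "insert scalar into a zero vector" as the new
    // X86ISD::ZEXT_VMOVL node instead of a shuffle against an all-zeros vector.
    // The .td patterns added in this commit then select it to movd/movq/movss/
    // movsd forms, which implicitly zero the upper lanes.
    static SDOperand lowerScalarIntoZeroVector(SDOperand Scalar, MVT::ValueType VT,
                                               SelectionDAG &DAG) {
      SDOperand Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Scalar);
      return DAG.getNode(X86ISD::ZEXT_VMOVL, VT, Vec);
    }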


@@ -975,38 +975,19 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
// Also handle the case where we explicitly require zeros in the top
// elements. This is a vector shuffle from the zero vector.
if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
if (N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
// Check to see if the top elements are all zeros (or bitcast of zeros).
ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
N.getOperand(1).Val->hasOneUse() &&
ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
N.getOperand(1).getOperand(0).hasOneUse()) {
// Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
// from the LHS.
unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
SDOperand ShufMask = N.getOperand(2);
assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
if (C->getValue() == VecWidth) {
for (unsigned i = 1; i != VecWidth; ++i) {
if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
// ok.
} else {
ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
if (C->getValue() >= VecWidth) return false;
}
}
}
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
return false;
OutChain = LD->getChain();
InChain = SDOperand(LD, 1);
return true;
}
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
N.getOperand(0).Val->hasOneUse() &&
ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).Val) &&
N.getOperand(0).getOperand(0).hasOneUse()) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
return false;
OutChain = LD->getChain();
InChain = SDOperand(LD, 1);
return true;
}
return false;
}
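
With the new node in place, the address-folding check above reduces to matching one fixed shape. A condensed restatement of those conditions (a sketch only; the real code also hands the load's chain and the selected address operands back to the caller):

    // (X86ISD::ZEXT_VMOVL (scalar_to_vector (non-extending load))), where every
    // node on the path has a single use, so folding the load is safe.
    static bool isZeroExtendingScalarLoad(SDOperand N) {
      return N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
             N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
             N.getOperand(0).Val->hasOneUse() &&
             ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).Val) &&
             N.getOperand(0).getOperand(0).hasOneUse();
    }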


@@ -2605,11 +2605,16 @@ static bool ShouldXformToMOVHLPS(SDNode *Mask) {
}
/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector.
static inline bool isScalarLoadToVector(SDNode *N) {
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
N = N->getOperand(0).Val;
return ISD::isNON_EXTLoad(N);
if (ISD::isNON_EXTLoad(N)) {
if (LD)
*LD = cast<LoadSDNode>(N);
return true;
}
}
return false;
}
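
Since the helper grew an optional out-parameter, both call styles remain valid; a quick illustration (hypothetical caller named SomeNode, the in-tree user is getZextVMoveL further down):

    LoadSDNode *LD = NULL;
    if (isScalarLoadToVector(SomeNode, &LD)) {
      // LD now points at the non-extending load feeding the scalar_to_vector,
      // ready for further inspection or folding.
    }
    if (isScalarLoadToVector(SomeNode)) {
      // Query-only form; the defaulted NULL argument skips the out-parameter.
    }
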
@@ -3082,8 +3087,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
return SDOperand();
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64)
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDOperand V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, DAG);
}
return SDOperand();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16) {
@@ -3131,13 +3144,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
}
}
// Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
// clears the upper bits.
// FIXME: we can do the same for v4f32 case when we know both parts of
// the lower half come from scalar_to_vector (loadf32). We should do
// that in post legalizer dag combiner with target specific hooks.
if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
return V[0];
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
SmallVector<SDOperand, 8> MaskVec;
@@ -3475,6 +3481,38 @@ SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
&MaskVec[0], MaskVec.size()));
}
/// getZextVMoveL - Return a zero-extending vector move low node.
///
static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
SDOperand SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
LoadSDNode *LD = NULL;
if (!isScalarLoadToVector(SrcOp.Val, &LD))
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
// movssrr and movsdrr do not clear top bits. Try to use movd, movq
// instead.
MVT::ValueType EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
// PR2108
OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
return DAG.getNode(ISD::BIT_CONVERT, VT,
DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
SrcOp.getOperand(0).getOperand(0))));
}
}
}
return DAG.getNode(ISD::BIT_CONVERT, VT,
DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
}
SDOperand
X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
SDOperand V1 = Op.getOperand(0);
@@ -3515,27 +3553,33 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.Val)) {
SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
DAG, *this);
if (NewOp.Val) {
SDOperand NewV1 = NewOp.getOperand(0);
SDOperand NewV2 = NewOp.getOperand(1);
SDOperand NewMask = NewOp.getOperand(2);
if (isCommutedMOVL(NewMask.Val, true, false)) {
NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
NewV1, NewV2, getMOVLMask(2, DAG));
return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
return getZextVMoveL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
}
}
} else if (ISD::isBuildVectorAllZeros(V1.Val)) {
SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
DAG, *this);
if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
return getZextVMoveL(VT, NewOp.getValueType(), NewOp.getOperand(1),
DAG, Subtarget);
}
}
if (X86::isMOVLMask(PermMask.Val))
return (V1IsUndef) ? V2 : Op;
if (X86::isMOVLMask(PermMask.Val)) {
if (V1IsUndef)
return V2;
if (ISD::isBuildVectorAllZeros(V1.Val))
return getZextVMoveL(VT, VT, V2, DAG, Subtarget);
return Op;
}
if (X86::isMOVSHDUPMask(PermMask.Val) ||
X86::isMOVSLDUPMask(PermMask.Val) ||
@@ -5629,8 +5673,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
case X86ISD::LCMPXCHG_DAG: return "x86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "x86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::ZEXT_VMOVL: return "X86ISD::ZEXT_VMOVL";
}
}
@@ -6192,16 +6237,46 @@ static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
return false;
}
static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
const X86Subtarget *Subtarget) {
static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, MachineFrameInfo *MFI,
const X86Subtarget *Subtarget) {
GlobalValue *GV;
int64_t Offset = 0;
if (isGAPlusOffset(Base, GV, Offset))
return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
return (GV->getAlignment() >= N && (Offset % N) == 0);
// DAG combine handles the stack object case.
return false;
}
static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
unsigned NumElems, MVT::ValueType EVT,
MachineFrameInfo *MFI,
SelectionDAG &DAG, SDNode *&Base) {
Base = NULL;
for (unsigned i = 0; i < NumElems; ++i) {
SDOperand Idx = PermMask.getOperand(i);
if (Idx.getOpcode() == ISD::UNDEF) {
if (!Base)
return false;
continue;
}
unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
SDOperand Elt = getShuffleScalarElt(N, Index, DAG);
if (!Elt.Val ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
return false;
if (!Base) {
Base = Elt.Val;
continue;
}
if (Elt.getOpcode() == ISD::UNDEF)
continue;
if (!isConsecutiveLoad(Elt.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
return false;
}
return true;
}
/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
@@ -6209,36 +6284,17 @@ static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
/// order.
static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MVT::ValueType VT = N->getValueType(0);
MVT::ValueType EVT = MVT::getVectorElementType(VT);
SDOperand PermMask = N->getOperand(2);
unsigned NumElems = PermMask.getNumOperands();
SDNode *Base = NULL;
for (unsigned i = 0; i < NumElems; ++i) {
SDOperand Elt = PermMask.getOperand(i);
if (Elt.getOpcode() == ISD::UNDEF) {
if (!Base)
return SDOperand();
continue;
}
unsigned Idx = cast<ConstantSDNode>(Elt)->getValue();
SDOperand Arg = getShuffleScalarElt(N, Idx, DAG);
if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
return SDOperand();
if (!Base) {
Base = Arg.Val;
continue;
}
if (!isConsecutiveLoad(Arg.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
return SDOperand();
}
if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, MFI, DAG, Base))
return SDOperand();
LoadSDNode *LD = cast<LoadSDNode>(Base);
if (isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget))
if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, MFI, Subtarget))
return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
LD->getSrcValueOffset(), LD->isVolatile());
return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
@@ -6319,12 +6375,13 @@ static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
StoreSDNode *St = cast<StoreSDNode>(N);
if (MVT::isVector(St->getValue().getValueType()) &&
MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
isa<LoadSDNode>(St->getValue()) &&
@@ -6442,8 +6499,7 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case ISD::STORE:
return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);


@@ -181,10 +181,10 @@ namespace llvm {
/// in order to obtain suitable precision.
FRSQRT, FRCP,
// Thread Local Storage
// TLSADDR, THREAD_POINTER - Thread Local Storage.
TLSADDR, THREAD_POINTER,
// Exception Handling helpers
// EH_RETURN - Exception Handling helpers.
EH_RETURN,
/// TC_RETURN - Tail call return.
@@ -194,12 +194,15 @@ namespace llvm {
/// operand #3 optional in flag
TC_RETURN,
// compare and swap
// LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
LCMPXCHG_DAG,
LCMPXCHG8_DAG,
// Store FP control world into i16 memory
FNSTCW16m
// FNSTCW16m - Store FP control world into i16 memory.
FNSTCW16m,
// ZEXT_VMOVL - Vector move low and zero extend.
ZEXT_VMOVL
};
}
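
The new enumerator's semantics are simple to state: keep lane 0 of its single vector operand and zero every other lane, which is exactly what the hardware movd/movq/movss-from-memory forms do. A small scalar model of that behaviour (plain C++, illustrative only, unrelated to the SelectionDAG code above):

    #include <array>
    #include <cstddef>

    // Model of ZEXT_VMOVL on an N-lane vector: low lane preserved, rest zeroed.
    template <typename T, std::size_t N>
    std::array<T, N> zext_vmovl(const std::array<T, N> &v) {
      std::array<T, N> r{};  // value-initialization zeroes every lane
      r[0] = v[0];           // keep the low element
      return r;
    }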


@@ -200,18 +200,14 @@ let AddedComplexity = 15 in
// movd to MMX register zero-extends
def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(v2i32 (vector_shuffle immAllZerosV,
(v2i32 (scalar_to_vector GR32:$src)),
MMX_MOVL_shuffle_mask)))]>;
[(set VR64:$dst,
(v2i32 (X86zvmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
let AddedComplexity = 20 in
def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(v2i32 (vector_shuffle immAllZerosV,
(v2i32 (scalar_to_vector
(loadi32 addr:$src))),
MMX_MOVL_shuffle_mask)))]>;
[(set VR64:$dst,
(v2i32 (X86zvmovl (v2i32
(scalar_to_vector (loadi32 addr:$src))))))]>;
// Arithmetic Instructions
@@ -564,14 +560,10 @@ def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
(bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
MMX_MOVL_shuffle_mask)),
(MMX_MOVZDI2PDIrr GR32:$src)>;
def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
MMX_MOVL_shuffle_mask)),
(MMX_MOVZDI2PDIrr GR32:$src)>;
def : Pat<(v8i8 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
(MMX_MOVZDI2PDIrr GR32:$src)>;
def : Pat<(v4i16 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
(MMX_MOVZDI2PDIrr GR32:$src)>;
}
// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower


@@ -47,6 +47,7 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW",
def X86insrtps : SDNode<"X86ISD::INSERTPS",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
def X86zvmovl : SDNode<"X86ISD::ZEXT_VMOVL", SDTUnaryOp>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
@@ -1007,10 +1008,11 @@ let neverHasSideEffects = 1 in
let AddedComplexity = 20 in
def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector (loadf32 addr:$src))),
MOVL_shuffle_mask)))]>;
[(set VR128:$dst, (v4f32 (X86zvmovl (v4f32 (scalar_to_vector
(loadf32 addr:$src))))))]>;
def : Pat<(v4f32 (X86zvmovl (memopv4f32 addr:$src))),
(MOVZSS2PSrm addr:$src)>;
//===----------------------------------------------------------------------===//
// SSE2 Instructions
@@ -2264,51 +2266,36 @@ let AddedComplexity = 20 in
def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector
(loadf64 addr:$src))),
MOVL_shuffle_mask)))]>;
(v2f64 (X86zvmovl (v2f64 (scalar_to_vector
(loadf64 addr:$src))))))]>;
def : Pat<(v2f64 (X86zvmovl (memopv2f64 addr:$src))),
(MOVZSD2PDrm addr:$src)>;
// movd / movq to XMM register zero-extends
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (vector_shuffle immAllZerosV,
(v4i32 (scalar_to_vector GR32:$src)),
MOVL_shuffle_mask)))]>;
[(set VR128:$dst, (v4i32 (X86zvmovl
(v4i32 (scalar_to_vector GR32:$src)))))]>;
// This is X86-64 only.
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (vector_shuffle immAllZerosV_bc,
(v2i64 (scalar_to_vector GR64:$src)),
MOVL_shuffle_mask)))]>;
[(set VR128:$dst, (v2i64 (X86zvmovl
(v2i64 (scalar_to_vector GR64:$src)))))]>;
}
// Handle the v2f64 form of 'MOVZQI2PQIrr' for PR2108. FIXME: this would be
// better written as a dag combine xform.
let AddedComplexity = 15 in
def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector
(f64 (bitconvert GR64:$src)))),
MOVL_shuffle_mask)),
(MOVZQI2PQIrr GR64:$src)>, Requires<[HasSSE2]>;
let AddedComplexity = 20 in {
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (vector_shuffle immAllZerosV,
(v4i32 (scalar_to_vector (loadi32 addr:$src))),
MOVL_shuffle_mask)))]>;
(v4i32 (X86zvmovl (v4i32 (scalar_to_vector
(loadi32 addr:$src))))))]>;
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (vector_shuffle immAllZerosV_bc,
(v2i64 (scalar_to_vector (loadi64 addr:$src))),
MOVL_shuffle_mask)))]>, XS,
(v2i64 (X86zvmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))]>, XS,
Requires<[HasSSE2]>;
}
@@ -2317,17 +2304,14 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
VR128:$src,
MOVL_shuffle_mask)))]>,
[(set VR128:$dst, (v2i64 (X86zvmovl (v2i64 VR128:$src))))]>,
XS, Requires<[HasSSE2]>;
let AddedComplexity = 20 in
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
(memopv2i64 addr:$src),
MOVL_shuffle_mask)))]>,
[(set VR128:$dst, (v2i64 (X86zvmovl
(memopv2i64 addr:$src))))]>,
XS, Requires<[HasSSE2]>;
//===----------------------------------------------------------------------===//
@@ -2774,11 +2758,9 @@ let Predicates = [HasSSE2] in {
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
def : Pat<(v2f64 (X86zvmovl (v2f64 (scalar_to_vector FR64:$src)))),
(MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
def : Pat<(v4f32 (X86zvmovl (v4f32 (scalar_to_vector FR32:$src)))),
(MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
}


@@ -1,8 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep movlhps %t | count 1
; RUN: grep unpcklps %t | count 1
; RUN: grep punpckldq %t | count 1
; RUN: grep movq %t | count 1
; RUN: grep movsd %t | count 1
define <4 x float> @test1(float %a, float %b) nounwind {
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]


@@ -1,5 +1,6 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep unpcklps %t | count 1
; RUN: grep movss %t | count 1
; RUN: grep movups %t | count 1
; RUN: grep shufps %t | count 1
define <4 x float> @test(float %a, float %b, float %c) nounwind {


@@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep movd
define <2 x i64> @t1(i64 %x) nounwind {
%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
ret <2 x i64> %tmp8
}


@@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
define <4 x i32> @t(i32 %x, i32 %y) nounwind {
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
ret <4 x i32> %tmp2
}