Implement aarch64 neon instruction set AdvSIMD (copy).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192410 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-05 01:31:05 +00:00 · 2013-10-11 02:33:55 +00:00 · 2013-10-11 02:33:55 +00:00 · 767f816b92
commit 767f816b92
parent 6c066c044e
6 changed files with 875 additions and 56 deletions
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -297,15 +297,23 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);

+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
@ -866,12 +874,12 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    return "AArch64ISD::NEON_CMPZ";
  case AArch64ISD::NEON_TST:
    return "AArch64ISD::NEON_TST";
-  case AArch64ISD::NEON_DUPIMM:
-    return "AArch64ISD::NEON_DUPIMM";
  case AArch64ISD::NEON_QSHLs:
    return "AArch64ISD::NEON_QSHLs";
  case AArch64ISD::NEON_QSHLu:
    return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
  case AArch64ISD::NEON_VDUPLANE:
    return "AArch64ISD::NEON_VDUPLANE";
  default:
@ -3342,7 +3350,7 @@ static SDValue PerformShiftCombine(SDNode *N,
  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
      SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                      DAG.getConstant(Cnt, MVT::i32));
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
    }
@ -3352,7 +3360,7 @@ static SDValue PerformShiftCombine(SDNode *N,
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
      SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                      DAG.getConstant(Cnt, MVT::i32));
      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
    }
@ -3492,6 +3500,107 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
      }
    }
  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  // Use VDUP for non-constant splats.
+  if (hasDominantValue && EltSize <= 64) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
  return SDValue();
 }

@ -3499,6 +3608,7 @@ SDValue
 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
@ -3516,9 +3626,89 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+              i != (unsigned)Lane) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+                             V1.getOperand(Lane));
+      }
      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, MVT::i64));
    }
+    // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
+    // by element from V2 to V1 .
+    // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
+    // better choice to be inserted than V1 as less insert needed, so we count
+    // element to be inserted for both V1 and V2, and select less one as insert
+    // target.
+
+    // Collect elements need to be inserted and their index.
+    SmallVector<int, 8> NV1Elt;
+    SmallVector<int, 8> N1Index;
+    SmallVector<int, 8> NV2Elt;
+    SmallVector<int, 8> N2Index;
+    int Length = ShuffleMask.size();
+    int V1EltNum = V1.getValueType().getVectorNumElements();
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != I) {
+        NV1Elt.push_back(ShuffleMask[I]);
+        N1Index.push_back(I);
+      }
+    }
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != (I + V1EltNum)) {
+        NV2Elt.push_back(ShuffleMask[I]);
+        N2Index.push_back(I);
+      }
+    }
+
+    // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
+    // will be inserted.
+    SDValue InsV = V1;
+    SmallVector<int, 8> InsMasks = NV1Elt;
+    SmallVector<int, 8> InsIndex = N1Index;
+    if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+      if (NV1Elt.size() > NV2Elt.size()) {
+        InsV = V2;
+        InsMasks = NV2Elt;
+        InsIndex = N2Index;
+      }
+    } else {
+      InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+    }
+
+    SDValue PassN;
+
+    for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+      SDValue ExtV = V1;
+      int Mask = InsMasks[I];
+      if (Mask > V1EltNum) {
+        ExtV = V2;
+        Mask -= V1EltNum;
+      }
+      // Any value type smaller than i32 is illegal in AArch64, and this lower
+      // function is called after legalize pass, so we need to legalize
+      // the result here.
+      EVT EltVT = MVT::i32;
+      if(EltSize == 64)
+        EltVT = MVT::i64;
+      PassN = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                          DAG.getConstant(Mask, MVT::i64));
+      PassN = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, PassN,
+                          DAG.getConstant(InsIndex[I], MVT::i64));
+    }
+    return PassN;
  }

  return SDValue();
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@ -134,13 +134,13 @@ namespace AArch64ISD {
    // Vector compare bitwise test
    NEON_TST,

-    // Operation for the immediate in vector shift
-    NEON_DUPIMM,
-
    // Vector saturating shift
    NEON_QSHLs,
    NEON_QSHLu,

+    // Vector dup
+    NEON_VDUP,
+
    // Vector dup by lane
    NEON_VDUPLANE
  };
@ -296,6 +296,10 @@ enum NeonModImmType {
  Neon_Mov_Imm,
  Neon_Mvn_Imm
 };
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+                                bool &usesOnlyOneValue, bool &hasDominantValue,
+                                bool &isConstant, bool &isUNDEF);
 } // namespace llvm

 #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@ -41,14 +41,13 @@ def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
 def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
                 [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;

-def Neon_dupImm : SDNode<"AArch64ISD::NEON_DUPIMM", SDTypeProfile<1, 1, 
-                    [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
-
 def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                     SDTCisVT<2, i32>]>;
 def Neon_sqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
 def Neon_uqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;

+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>]>>;
 def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
                           [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;

@ -1480,7 +1479,7 @@ class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
                     [(set (Ty VPRC:$Rd),
                        (Ty (OpNode (Ty VPRC:$Rn),
-                          (Ty (Neon_dupImm (i32 imm:$Imm))))))],
+                          (Ty (Neon_vdup (i32 imm:$Imm))))))],
                     NoItinerary>;

 multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
@ -1585,7 +1584,7 @@ class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
                     [(set (DestTy VPR128:$Rd),
                        (DestTy (shl
                          (DestTy (ExtOp (SrcTy VPR64:$Rn))),
-                            (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+                            (DestTy (Neon_vdup (i32 imm:$Imm))))))],
                     NoItinerary>;

 class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
@ -1599,7 +1598,7 @@ class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
                        (DestTy (shl
                          (DestTy (ExtOp
                            (SrcTy (getTop VPR128:$Rn)))),
-                              (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+                              (DestTy (Neon_vdup (i32 imm:$Imm))))))],
                     NoItinerary>;

 multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
@ -1771,7 +1770,7 @@ class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
           [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
              (Ty (OpNode (Ty VPRC:$Rn),
-                (Ty (Neon_dupImm (i32 imm:$Imm))))))))],
+                (Ty (Neon_vdup (i32 imm:$Imm))))))))],
           NoItinerary> {
  let Constraints = "$src = $Rd";
 }
@ -2048,48 +2047,48 @@ def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),

 def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
                             (v8i16 (srl (v8i16 node:$lhs),
-                               (v8i16 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
                             (v4i32 (srl (v4i32 node:$lhs),
-                               (v4i32 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
                             (v2i64 (srl (v2i64 node:$lhs),
-                               (v2i64 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
                             (v8i16 (sra (v8i16 node:$lhs),
-                               (v8i16 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
                             (v4i32 (sra (v4i32 node:$lhs),
-                               (v4i32 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
                             (v2i64 (sra (v2i64 node:$lhs),
-                               (v2i64 (Neon_dupImm (i32 node:$rhs)))))>;
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;

 // Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
 multiclass Neon_shiftNarrow_patterns<string shr> {
  def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
            (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
  def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
            (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
  def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
            (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;

  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
              (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
-                VPR128:$Rn, imm:$Imm)))))),
-            (SHRNvvi_16B (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                VPR128:$Rn, (i32 imm:$Imm))))))),
+            (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
                         VPR128:$Rn, imm:$Imm)>;
  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
              (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
-                VPR128:$Rn, imm:$Imm)))))),
+                VPR128:$Rn, (i32 imm:$Imm))))))),
            (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
                        VPR128:$Rn, imm:$Imm)>;
  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
              (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
-                VPR128:$Rn, imm:$Imm)))))),
+                VPR128:$Rn, (i32 imm:$Imm))))))),
            (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
                        VPR128:$Rn, imm:$Imm)>;
 }
@ -2486,13 +2485,13 @@ multiclass NeonI_get_high
 {
  def _8h : PatFrag<(ops node:$Rn),
                    (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
-                                             (v8i16 (Neon_dupImm 8))))))>;
+                                             (v8i16 (Neon_vdup (i32 8)))))))>;
  def _4s : PatFrag<(ops node:$Rn),
                    (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
-                                              (v4i32 (Neon_dupImm 16))))))>;
+                                              (v4i32 (Neon_vdup (i32 16)))))))>;
  def _2d : PatFrag<(ops node:$Rn),
                    (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
-                                              (v2i64 (Neon_dupImm 32))))))>;
+                                              (v2i64 (Neon_vdup (i32 32)))))))>;
 }

 defm NI_get_hi : NeonI_get_high;
@ -4513,6 +4512,46 @@ def INSELd : NeonI_INS_element<"ins", "d", v2i64, neon_uimm1_bare, i64> {
  // bits 11-13 are unspecified.
 }

+multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
+                                      ValueType MidTy,
+                                      RegisterClass OpFPR, Operand ResImm,
+                                      SubRegIndex SubIndex, Instruction INS> {
+def : Pat<(ResTy (vector_insert
+          (ResTy VPR128:$src),
+          (MidTy (vector_extract
+            (ResTy VPR128:$Rn),
+            (ResImm:$Immn))),
+          (ResImm:$Immd))),
+        (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+          ResImm:$Immd, ResImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+             ResImm:$Imm,
+             (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (NaTy (EXTRACT_SUBREG 
+             (ResTy (INS 
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+               ResImm:$Imm,
+               (i64 0))),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
+                                  sub_32, INSELs>;
+defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
+                                  sub_64, INSELd>;
+
 multiclass Neon_INS_elt_pattern <ValueType NaTy, Operand NaImm,
                                ValueType MidTy, ValueType StTy,
                                Operand StImm, Instruction INS> { 
@ -4557,14 +4596,15 @@ def : Pat<(NaTy (vector_insert
          sub_64))>;
 }

-defm INSb_pattern : Neon_INS_elt_pattern<v8i8, neon_uimm3_bare, i32,
-                                         v16i8, neon_uimm4_bare, INSELb>;
-defm INSh_pattern : Neon_INS_elt_pattern<v4i16, neon_uimm2_bare, i32,
-                                         v8i16, neon_uimm3_bare, INSELh>;
-defm INSs_pattern : Neon_INS_elt_pattern<v2i32, neon_uimm1_bare, i32,
-                                         v4i32, neon_uimm2_bare, INSELs>;
-defm INSd_pattern : Neon_INS_elt_pattern<v1i64, neon_uimm0_bare, i64,
-                                         v2i64, neon_uimm1_bare, INSELd>;
+defm : Neon_INS_elt_pattern<v8i8, neon_uimm3_bare, i32,
+                            v16i8, neon_uimm4_bare, INSELb>;
+defm : Neon_INS_elt_pattern<v4i16, neon_uimm2_bare, i32,
+                            v8i16, neon_uimm3_bare, INSELh>;
+defm : Neon_INS_elt_pattern<v2i32, neon_uimm1_bare, i32,
+                            v4i32, neon_uimm2_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v1i64, neon_uimm0_bare, i64,
+                            v2i64, neon_uimm1_bare, INSELd>;
+

 class NeonI_SMOV<string asmop, string Res, bit Q,
                 ValueType OpTy, ValueType eleTy,
@ -4640,12 +4680,12 @@ multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
              NaImm:$Imm)>; 
 }

-defm SMOVxb_pattern : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                                          neon_uimm3_bare, SMOVxb>;
-defm SMOVxh_pattern : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                                          neon_uimm2_bare, SMOVxh>;
-defm SMOVxs_pattern : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                                          neon_uimm1_bare, SMOVxs>;
+defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                          neon_uimm3_bare, SMOVxb>;
+defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                          neon_uimm2_bare, SMOVxh>;
+defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                          neon_uimm1_bare, SMOVxs>;

 class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
                          ValueType eleTy, Operand StImm,  Operand NaImm,
@ -4657,11 +4697,10 @@ class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
          NaImm:$Imm)>;

-def SMOVwb_pattern : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                                          neon_uimm3_bare, SMOVwb>;
-def SMOVwh_pattern : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                                          neon_uimm2_bare, SMOVwh>;
-
+def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                         neon_uimm3_bare, SMOVwb>;
+def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                         neon_uimm2_bare, SMOVwh>;

 class NeonI_UMOV<string asmop, string Res, bit Q,
                 ValueType OpTy, Operand OpImm,
@ -4702,12 +4741,12 @@ class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
          NaImm:$Imm)>;

-def UMOVwb_pattern : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
-                                       neon_uimm3_bare, UMOVwb>;
-def UMOVwh_pattern : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
-                                       neon_uimm2_bare, UMOVwh>; 
-def UMOVws_pattern : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                                       neon_uimm1_bare, UMOVws>;
+def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                        neon_uimm3_bare, UMOVwb>;
+def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                        neon_uimm2_bare, UMOVwh>; 
+def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                        neon_uimm1_bare, UMOVws>;

 def : Pat<(i32 (and
            (i32 (vector_extract
@ -4786,4 +4825,179 @@ def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
 def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
          (v1f32 FPR32:$Rn)>;
 def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (v1f64 FPR64:$Rn)>;
+          (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+
+class NeonI_DUP_Elt<bit Q, string asmop, string rdlane,  string rnlane,
+                    RegisterOperand ResVPR, ValueType ResTy,
+                    ValueType OpTy, Operand OpImm>
+  : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
+               (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
+               [],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, v16i8, v16i8,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, v8i16, v8i16,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, v4i32, v4i32,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, v2i64, v2i64,
+                              neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, v8i8, v16i8,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, v4i16, v8i16,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, v2i32, v4i32,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
+                                       ValueType OpTy,ValueType NaTy,
+                                       ValueType ExTy, Operand OpLImm,
+                                       Operand OpNImm> {
+def  : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+        (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
+
+def : Pat<(ResTy (Neon_vduplane
+            (NaTy VPR64:$Rn), OpNImm:$Imm)),
+          (ResTy (DUPELT
+            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v2f32 (DUPELT2s 
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v4f32 (DUPELT4s 
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+          (v2f64 (DUPELT2d 
+            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+            (i64 0)))>;
+
+class NeonI_DUP<bit Q, string asmop, string rdlane,
+                RegisterOperand ResVPR, ValueType ResTy,
+                RegisterClass OpGPR, ValueType OpTy>
+  : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
+               asmop # "\t$Rd" # rdlane # ", $Rn",
+               [(set (ResTy ResVPR:$Rd), 
+                 (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
+               NoItinerary>;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+  let Inst{19-16} = 0b1000;
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+// patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+          (INSELd 
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+            (i64 1),
+            (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+          (DUPELT2d 
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (i64 0))> ;
+}
+
+defm : Concat_Vector_Pattern<v16i8, v8i8>;
+defm : Concat_Vector_Pattern<v8i16, v4i16>;
+defm : Concat_Vector_Pattern<v4i32, v2i32>;
+defm : Concat_Vector_Pattern<v2i64, v1i64>;
+defm : Concat_Vector_Pattern<v4f32, v2f32>;
+defm : Concat_Vector_Pattern<v2f64, v1f64>;
+
+//patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
--- a/test/CodeGen/AArch64/neon-copy.ll
+++ b/test/CodeGen/AArch64/neon-copy.ll
@ -225,8 +225,196 @@ define i64 @smovx2s(<2 x i32> %tmp1) {
  ret i64 %tmp4
 }

+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+  ret <8 x i8> %vset_lane
+}

+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+  ret <16 x i8> %vset_lane
+}

+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i8> %vset_lane
+}

+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vset_lane
+}

+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+  ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+  ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+  ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+  ret <1 x i64> %vecinit.i
+}
+
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+  ret <16 x i8> %vecinit15.i
+}
+
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+  ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+  ret <4 x i32> %vecinit3.i
+}
+
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+  ret <2 x i64> %vecinit1.i
+}
+
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}

--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@ -4101,3 +4101,188 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        st4 {v31.2s-v1.2s}, [x31]
 // CHECK-ERROR:            ^
+
+         ins v2.b[16], w1
+         ins v7.h[8], w14
+         ins v20.s[5], w30
+         ins v1.d[2], x7
+         ins v2.b[3], b1
+         ins v7.h[2], h14
+         ins v20.s[1], s30
+         ins v1.d[0], d7
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v2.b[16], w1
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v7.h[8], w14
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v20.s[5], w30
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v1.d[2], x7
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v2.b[3], b1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v7.h[2], h14
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v20.s[1], s30
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v1.d[0], d7
+// CHECK-ERROR:                      ^
+
+         smov w1, v0.b[16]
+         smov w14, v6.h[8]
+         smov x1, v0.b[16]
+         smov x14, v6.h[8]
+         smov x20, v9.s[5]
+         smov w1, v0.d[0]
+         smov w14, v6.d[1]
+         smov x1, v0.d[0]
+         smov x14, v6.d[1]
+         smov x20, v9.d[0]
+
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov w1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov w14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x20, v9.s[5]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov w1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov w14, v6.d[1]
+// CHECK-ERROR                      ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x14, v6.d[1]
+// CHECK-ERROR                      ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x20, v9.d[0]
+// CHECK-ERROR                      ^
+
+         umov w1, v0.b[16]
+         umov w14, v6.h[8]
+         umov w20, v9.s[5]
+         umov x7, v18.d[3]
+         umov w1, v0.d[0]
+         umov s20, v9.s[2]
+         umov d7, v18.d[1]
+
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w20, v9.s[5]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov x7, v18.d[3]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov w1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov s20, v9.s[2]
+// CHECK-ERROR              ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov d7, v18.d[1]
+// CHECK-ERROR              ^
+
+         Ins v1.h[2], v3.b[6]
+         Ins v6.h[7], v7.s[2]
+         Ins v15.d[0], v22.s[2]
+         Ins v0.d[0], v4.b[1]
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v1.h[2], v3.b[6]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v6.h[7], v7.s[2]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v15.d[0], v22.s[2]
+// CHECK-ERROR                           ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v0.d[0], v4.b[1]
+// CHECK-ERROR                         ^
+
+         dup v1.8h, v2.b[2]
+         dup v11.4s, v7.h[7]
+         dup v17.2d, v20.s[0]
+         dup v1.16b, v2.h[2]
+         dup v11.8h, v7.s[3]
+         dup v17.4s, v20.d[0]
+         dup v5.2d, v1.b[1]
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.8h, v2.b[2]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.4s, v7.h[7]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.2d, v20.s[0]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.16b, v2.h[2]
+// CHECK-ERROR                        ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v11.8h, v7.s[3]
+// CHECK-ERROR                        ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v17.4s, v20.d[0]
+// CHECK-ERROR                         ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v5.2d, v1.b[1]
+// CHECK-ERROR                       ^
+
+         dup v1.8b, b1
+         dup v11.4h, h14
+         dup v17.2s, s30
+         dup v1.16b, d2
+         dup v11.8s, w16
+         dup v17.4d, w28
+         dup v5.2d, w0
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.8b, b1
+// CHECK-ERROR                    ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.4h, h14
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.2s, s30
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.16b, d2
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.8s, w16
+// CHECK-ERROR             ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.4d, w28
+// CHECK-ERROR             ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v5.2d, w0
+// CHECK-ERROR                    ^
+
--- a/test/MC/AArch64/neon-simd-copy.s
+++ b/test/MC/AArch64/neon-simd-copy.s
@ -60,6 +60,44 @@
 // CHECK: ins	v15.s[3], v22.s[2]      // encoding: [0xcf,0x5e,0x1c,0x6e]
 // CHECK: ins	v0.d[0], v4.d[1]        // encoding: [0x80,0x44,0x08,0x6e]

+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from element)
+//------------------------------------------------------------------------------
+         dup v1.8b, v2.b[2]
+         dup v11.4h, v7.h[7]
+         dup v17.2s, v20.s[0]
+         dup v1.16b, v2.b[2]
+         dup v11.8h, v7.h[7]
+         dup v17.4s, v20.s[0]
+         dup v5.2d, v1.d[1]         
+
+// CHECK: dup v1.8b, v2.b[2]        // encoding: [0x41,0x04,0x05,0x0e]
+// CHECK: dup v11.4h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x0e]
+// CHECK: dup v17.2s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x0e]
+// CHECK: dup v1.16b, v2.b[2]       // encoding: [0x41,0x04,0x05,0x4e]
+// CHECK: dup v11.8h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x4e]
+// CHECK: dup v17.4s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x4e]
+// CHECK: dup v5.2d, v1.d[1]        // encoding: [0x25,0x04,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from main)
+//------------------------------------------------------------------------------
+         dup v1.8b, w1
+         dup v11.4h, w14
+         dup v17.2s, w30
+         dup v1.16b, w2
+         dup v11.8h, w16
+         dup v17.4s, w28
+         dup v5.2d, x0        
+
+// CHECK: dup	v1.8b, w1             // encoding: [0x21,0x0c,0x01,0x0e]
+// CHECK: dup	v11.4h, w14           // encoding: [0xcb,0x0d,0x0a,0x0e]
+// CHECK: dup	v17.2s, w30           // encoding: [0xd1,0x0f,0x14,0x0e]
+// CHECK: dup	v1.16b, w2            // encoding: [0x41,0x0c,0x01,0x4e]
+// CHECK: dup	v11.8h, w16           // encoding: [0x0b,0x0e,0x0a,0x4e]
+// CHECK: dup	v17.4s, w28           // encoding: [0x91,0x0f,0x14,0x4e]
+// CHECK: dup	v5.2d, x0             // encoding: [0x05,0x0c,0x08,0x4e]
+