ARM64: initial backend import
This adds a second implementation of the AArch64 architecture to LLVM, accessible in parallel via the "arm64" triple. The plan over the coming weeks & months is to merge the two into a single backend, during which time thorough code review should naturally occur. Everything will be easier with the target in-tree though, hence this commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205090 91177308-0d34-0410-b5e6-96231b3b80d8
parent 69bd9577fc
commit 7b837d8c75
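The patch introduces a distinct arm64 enumerator in llvm::Triple, and target-independent code selects the new backend by checking for it, exactly as the LTOCodeGenerator.cpp and LTOModule.cpp hunks later in this diff do when choosing a default CPU. A minimal sketch of that pattern (the helper function name is illustrative, not from the patch):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"

// Sketch: pick a default CPU the way the LTO hunks in this commit do.
static llvm::StringRef defaultCPUFor(const llvm::Triple &T) {
  if (T.getArch() == llvm::Triple::x86_64)
    return "core2";
  if (T.getArch() == llvm::Triple::x86)
    return "yonah";
  if (T.getArch() == llvm::Triple::arm64) // enumerator added by this commit
    return "cyclone";
  return "";
}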
@@ -128,6 +128,7 @@ set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name

set(LLVM_ALL_TARGETS
  AArch64
  ARM64
  ARM
  CppBackend
  Hexagon
@@ -143,7 +144,7 @@ set(LLVM_ALL_TARGETS
  )

# List of targets with JIT support:
set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ)
set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM64 ARM Mips SystemZ)

set(LLVM_TARGETS_TO_BUILD "all"
  CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
@@ -419,6 +419,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
  amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;;
  sparc*-*) llvm_cv_target_arch="Sparc" ;;
  powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
  arm64*-*) llvm_cv_target_arch="ARM64" ;;
  arm*-*) llvm_cv_target_arch="ARM" ;;
  aarch64*-*) llvm_cv_target_arch="AArch64" ;;
  mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
@@ -454,6 +455,7 @@ case $host in
  amd64-* | x86_64-*) host_arch="x86_64" ;;
  sparc*-*) host_arch="Sparc" ;;
  powerpc*-*) host_arch="PowerPC" ;;
  arm64*-*) host_arch="ARM64" ;;
  arm*-*) host_arch="ARM" ;;
  aarch64*-*) host_arch="AArch64" ;;
  mips-* | mips64-*) host_arch="Mips" ;;
@@ -795,7 +797,7 @@ else
  esac
fi

TARGETS_WITH_JIT="AArch64 ARM Mips PowerPC SystemZ X86"
TARGETS_WITH_JIT="AArch64 ARM ARM64 Mips PowerPC SystemZ X86"
AC_SUBST(TARGETS_WITH_JIT,$TARGETS_WITH_JIT)

dnl Allow enablement of building and installing docs
@@ -948,14 +950,14 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then
fi

dnl List all possible targets
ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
AC_SUBST(ALL_TARGETS,$ALL_TARGETS)

dnl Allow specific targets to be specified for building (or not)
TARGETS_TO_BUILD=""
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
  [Build specific host targets: all or target1,target2,... Valid targets are:
  host, x86, x86_64, sparc, powerpc, arm, aarch64, mips, hexagon,
  host, x86, x86_64, sparc, powerpc, arm64, arm, aarch64, mips, hexagon,
  xcore, msp430, nvptx, systemz, r600, and cpp (default=all)]),,
  enableval=all)
if test "$enableval" = host-only ; then
@@ -970,6 +972,7 @@ case "$enableval" in
  sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
  powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
  aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;;
  arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;;
  arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
  mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
  mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
@@ -366,6 +366,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "powerpc")
  set(LLVM_NATIVE_ARCH PowerPC)
elseif (LLVM_NATIVE_ARCH MATCHES "aarch64")
  set(LLVM_NATIVE_ARCH AArch64)
elseif (LLVM_NATIVE_ARCH MATCHES "arm64")
  set(LLVM_NATIVE_ARCH ARM64)
elseif (LLVM_NATIVE_ARCH MATCHES "arm")
  set(LLVM_NATIVE_ARCH ARM)
elseif (LLVM_NATIVE_ARCH MATCHES "mips")
configure (vendored, 13 changed lines)
@@ -1447,9 +1447,9 @@ Optional Features:
                          Enable crash handling overrides (default is YES)
  --enable-targets        Build specific host targets: all or
                          target1,target2,... Valid targets are: host, x86,
                          x86_64, sparc, powerpc, arm, aarch64, mips, hexagon,
                          xcore, msp430, nvptx, systemz, r600, and cpp
                          (default=all)
                          x86_64, sparc, powerpc, arm64, arm, aarch64, mips,
                          hexagon, xcore, msp430, nvptx, systemz, r600, and
                          cpp (default=all)
  --enable-experimental-targets
                          Build experimental host targets: disable or
                          target1,target2,... (default=disable)
@@ -4151,6 +4151,7 @@ else
  amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;;
  sparc*-*) llvm_cv_target_arch="Sparc" ;;
  powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
  arm64*-*) llvm_cv_target_arch="ARM64" ;;
  arm*-*) llvm_cv_target_arch="ARM" ;;
  aarch64*-*) llvm_cv_target_arch="AArch64" ;;
  mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
@@ -4187,6 +4188,7 @@ case $host in
  amd64-* | x86_64-*) host_arch="x86_64" ;;
  sparc*-*) host_arch="Sparc" ;;
  powerpc*-*) host_arch="PowerPC" ;;
  arm64*-*) host_arch="ARM64" ;;
  arm*-*) host_arch="ARM" ;;
  aarch64*-*) host_arch="AArch64" ;;
  mips-* | mips64-*) host_arch="Mips" ;;
@@ -5120,7 +5122,7 @@ else
  esac
fi

TARGETS_WITH_JIT="AArch64 ARM Mips PowerPC SystemZ X86"
TARGETS_WITH_JIT="AArch64 ARM ARM64 Mips PowerPC SystemZ X86"
TARGETS_WITH_JIT=$TARGETS_WITH_JIT


@@ -5357,7 +5359,7 @@ _ACEOF

fi

ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
ALL_TARGETS=$ALL_TARGETS


@@ -5381,6 +5383,7 @@ case "$enableval" in
  sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
  powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
  aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;;
  arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;;
  arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
  mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
  mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
@@ -95,6 +95,16 @@ struct LLVMOpInfo1 {
#define LLVMDisassembler_VariantKind_ARM_HI16 1 /* :upper16: */
#define LLVMDisassembler_VariantKind_ARM_LO16 2 /* :lower16: */

/**
 * The ARM64 target VariantKinds.
 */
#define LLVMDisassembler_VariantKind_ARM64_PAGE 1 /* @page */
#define LLVMDisassembler_VariantKind_ARM64_PAGEOFF 2 /* @pageoff */
#define LLVMDisassembler_VariantKind_ARM64_GOTPAGE 3 /* @gotpage */
#define LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF 4 /* @gotpageoff */
#define LLVMDisassembler_VariantKind_ARM64_TLVP 5 /* @tvlppage */
#define LLVMDisassembler_VariantKind_ARM64_TLVOFF 6 /* @tvlppageoff */

/**
 * The type for the symbol lookup function. This may be called by the
 * disassembler for things like adding a comment for a PC plus a constant
@@ -123,6 +133,17 @@ typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
/* The input reference is from a PC relative load instruction. */
#define LLVMDisassembler_ReferenceType_In_PCrel_Load 2

/* The input reference is from an ARM64::ADRP instruction. */
#define LLVMDisassembler_ReferenceType_In_ARM64_ADRP 0x100000001
/* The input reference is from an ARM64::ADDXri instruction. */
#define LLVMDisassembler_ReferenceType_In_ARM64_ADDXri 0x100000002
/* The input reference is from an ARM64::LDRXui instruction. */
#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXui 0x100000003
/* The input reference is from an ARM64::LDRXl instruction. */
#define LLVMDisassembler_ReferenceType_In_ARM64_LDRXl 0x100000004
/* The input reference is from an ARM64::ADR instruction. */
#define LLVMDisassembler_ReferenceType_In_ARM64_ADR 0x100000005

/* The output reference is to as symbol stub. */
#define LLVMDisassembler_ReferenceType_Out_SymbolStub 1
/* The output reference is to a symbol address in a literal pool. */
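The new reference-type constants are passed to the disassembler's symbol-lookup callback so a client can annotate ARM64 address-materialization sequences (ADRP/ADD/LDR pairs). A minimal sketch of such a callback, assuming the LLVMSymbolLookupCallback signature declared in this header; the actual symbol-table lookup is left as a stub:

#include "llvm-c/Disassembler.h"
#include <cstdint>

// Sketch of a symbol-lookup callback recognizing the new ARM64 reference
// types; a real client would resolve ReferenceValue against its symbol table.
static const char *lookupSymbol(void *DisInfo, uint64_t ReferenceValue,
                                uint64_t *ReferenceType, uint64_t ReferencePC,
                                const char **ReferenceName) {
  (void)DisInfo; (void)ReferenceValue; (void)ReferencePC;
  if (*ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_ADRP ||
      *ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_ADDXri) {
    // A page/pageoff pair: a comment naming the referenced symbol would be
    // attached here. Lookup omitted in this sketch.
    *ReferenceName = nullptr;
  }
  *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
  return nullptr;
}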
@@ -48,6 +48,7 @@ public:

    arm,        // ARM (little endian): arm, armv.*, xscale
    armeb,      // ARM (big endian): armeb
    arm64,      // ARM: arm64
    aarch64,    // AArch64 (little endian): aarch64
    aarch64_be, // AArch64 (big endian): aarch64_be
    hexagon,    // Hexagon: hexagon
@@ -529,6 +529,7 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
include "llvm/IR/IntrinsicsPowerPC.td"
include "llvm/IR/IntrinsicsX86.td"
include "llvm/IR/IntrinsicsARM.td"
include "llvm/IR/IntrinsicsARM64.td"
include "llvm/IR/IntrinsicsAArch64.td"
include "llvm/IR/IntrinsicsXCore.td"
include "llvm/IR/IntrinsicsHexagon.td"
include/llvm/IR/IntrinsicsARM64.td (new file, 621 lines)
@@ -0,0 +1,621 @@
//===- IntrinsicsARM64.td - Defines ARM64 intrinsics -------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the ARM64-specific intrinsics.
//
//===----------------------------------------------------------------------===//

let TargetPrefix = "arm64" in {

def int_arm64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>;
def int_arm64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>;
def int_arm64_clrex : Intrinsic<[]>;

def int_arm64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>;
def int_arm64_stxp : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty,
                                llvm_ptr_ty]>;

def int_arm64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
                                LLVMMatchType<0>], [IntrNoMem]>;
def int_arm64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
                                LLVMMatchType<0>], [IntrNoMem]>;
}
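A front end lowers constructs such as __builtin_arm_ldrex to these intrinsics. A minimal IRBuilder sketch for the exclusive load above; the enclosing helper is illustrative, and Intrinsic::arm64_ldxr is the enumerator TableGen derives from int_arm64_ldxr:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Sketch: emit a call to llvm.arm64.ldxr for an exclusive load of *Addr.
// The intrinsic is overloaded on the pointer type (llvm_anyptr_ty above).
static llvm::Value *emitLoadExclusive(llvm::IRBuilder<> &Builder,
                                      llvm::Module &M, llvm::Value *Addr) {
  llvm::Function *Ldxr = llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::arm64_ldxr, Addr->getType());
  return Builder.CreateCall(Ldxr, Addr);
}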
//===----------------------------------------------------------------------===//
|
||||
// Advanced SIMD (NEON)
|
||||
|
||||
let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.".
|
||||
class AdvSIMD_2Scalar_Float_Intrinsic
|
||||
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_FPToIntRounding_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_1IntArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_1FloatArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Expand_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Long_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_1IntArg_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Int_Across_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>;
|
||||
class AdvSIMD_1VectorArg_Float_Across_Intrinsic
|
||||
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_2IntArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2FloatArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Compare_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2Arg_FloatCompare_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Long_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMTruncatedType<0>,
|
||||
LLVMTruncatedType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Wide_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, LLVMTruncatedType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMExtendedType<0>, LLVMExtendedType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty],
|
||||
[LLVMExtendedType<0>, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_anyvector_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMTruncatedType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMTruncatedType<0>, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_3VectorArg_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_3VectorArg_Scalar_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty,
|
||||
LLVMMatchType<1>], [IntrNoMem]>;
|
||||
class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_CvtFxToFP_Intrinsic
|
||||
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_CvtFPToFx_Intrinsic
|
||||
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Arithmetic ops
|
||||
|
||||
let Properties = [IntrNoMem] in {
|
||||
// Vector Add Across Lanes
|
||||
def int_arm64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic;
|
||||
|
||||
// Vector Long Add Across Lanes
|
||||
def int_arm64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
|
||||
// Vector Halving Add
|
||||
def int_arm64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Rounding Halving Add
|
||||
def int_arm64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Add
|
||||
def int_arm64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Add High-Half
|
||||
// FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that
|
||||
// header is no longer supported.
|
||||
def int_arm64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Rounding Add High-Half
|
||||
def int_arm64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Saturating Doubling Multiply High
|
||||
def int_arm64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Rounding Doubling Multiply High
|
||||
def int_arm64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Polynomial Multiply
|
||||
def int_arm64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Long Multiply
|
||||
def int_arm64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic;
|
||||
def int_arm64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic;
|
||||
def int_arm64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic;
|
||||
|
||||
// Vector Extending Multiply
|
||||
def int_arm64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Doubling Long Multiply
|
||||
def int_arm64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic;
|
||||
def int_arm64_neon_sqdmulls_scalar
|
||||
: Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
// Vector Halving Subtract
|
||||
def int_arm64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Subtract
|
||||
def int_arm64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Subtract High-Half
|
||||
// FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that
|
||||
// header is no longer supported.
|
||||
def int_arm64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Rounding Subtract High-Half
|
||||
def int_arm64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Compare Absolute Greater-than-or-equal
|
||||
def int_arm64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic;
|
||||
|
||||
// Vector Compare Absolute Greater-than
|
||||
def int_arm64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic;
|
||||
|
||||
// Vector Absolute Difference
|
||||
def int_arm64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Scalar Absolute Difference
|
||||
def int_arm64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic;
|
||||
|
||||
// Vector Max
|
||||
def int_arm64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Max Across Lanes
|
||||
def int_arm64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic;
|
||||
def int_arm64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic;
|
||||
|
||||
// Vector Min
|
||||
def int_arm64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Vector Min/Max Number
|
||||
def int_arm64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic;
|
||||
def int_arm64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic;
|
||||
|
||||
// Vector Min Across Lanes
|
||||
def int_arm64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
|
||||
def int_arm64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic;
|
||||
def int_arm64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic;
|
||||
|
||||
// Pairwise Add
|
||||
def int_arm64_neon_addp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Long Pairwise Add
|
||||
// FIXME: In theory, we shouldn't need intrinsics for saddlp or
|
||||
// uaddlp, but tblgen's type inference currently can't handle the
|
||||
// pattern fragments this ends up generating.
|
||||
def int_arm64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic;
|
||||
def int_arm64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic;
|
||||
|
||||
// Folding Maximum
|
||||
def int_arm64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Folding Minimum
|
||||
def int_arm64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
def int_arm64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic;
|
||||
|
||||
// Reciprocal Estimate/Step
|
||||
def int_arm64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic;
|
||||
def int_arm64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Shift Left
|
||||
def int_arm64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Rounding Shift Left
|
||||
def int_arm64_neon_srshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_urshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Rounding Shift Left
|
||||
def int_arm64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Signed->Unsigned Shift Left by Constant
|
||||
def int_arm64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant
|
||||
def int_arm64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
|
||||
// Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const
|
||||
def int_arm64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
|
||||
// Vector Narrowing Shift Right by Constant
|
||||
def int_arm64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
def int_arm64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
|
||||
// Vector Rounding Narrowing Shift Right by Constant
|
||||
def int_arm64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
|
||||
// Vector Rounding Narrowing Saturating Shift Right by Constant
|
||||
def int_arm64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
def int_arm64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic;
|
||||
|
||||
// Vector Shift Left
|
||||
def int_arm64_neon_sshl : AdvSIMD_2IntArg_Intrinsic;
|
||||
def int_arm64_neon_ushl : AdvSIMD_2IntArg_Intrinsic;
|
||||
|
||||
// Vector Widening Shift Left by Constant
|
||||
def int_arm64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic;
|
||||
def int_arm64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic;
|
||||
def int_arm64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic;
|
||||
|
||||
// Vector Shift Right by Constant and Insert
|
||||
def int_arm64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic;
|
||||
|
||||
// Vector Shift Left by Constant and Insert
|
||||
def int_arm64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic;
|
||||
|
||||
// Vector Saturating Narrow
|
||||
def int_arm64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic;
|
||||
def int_arm64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic;
|
||||
def int_arm64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic;
|
||||
def int_arm64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Saturating Extract and Unsigned Narrow
|
||||
def int_arm64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic;
|
||||
def int_arm64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
|
||||
|
||||
// Vector Absolute Value
|
||||
def int_arm64_neon_abs : AdvSIMD_1VectorArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Absolute Value
|
||||
def int_arm64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
|
||||
|
||||
// Vector Saturating Negation
|
||||
def int_arm64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic;
|
||||
|
||||
// Vector Count Leading Sign Bits
|
||||
def int_arm64_neon_cls : AdvSIMD_1VectorArg_Intrinsic;
|
||||
|
||||
// Vector Reciprocal Estimate
|
||||
def int_arm64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic;
|
||||
def int_arm64_neon_frecpe : AdvSIMD_1VectorArg_Intrinsic;
|
||||
|
||||
// Vector Square Root Estimate
|
||||
def int_arm64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic;
|
||||
def int_arm64_neon_frsqrte : AdvSIMD_1VectorArg_Intrinsic;
|
||||
|
||||
// Vector Bitwise Reverse
|
||||
def int_arm64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic;
|
||||
|
||||
// Vector Conversions Between Half-Precision and Single-Precision.
|
||||
def int_arm64_neon_vcvtfp2hf
|
||||
: Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_arm64_neon_vcvthf2fp
|
||||
: Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>;
|
||||
|
||||
// Vector Conversions Between Floating-point and Fixed-point.
|
||||
def int_arm64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic;
|
||||
def int_arm64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic;
|
||||
def int_arm64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic;
|
||||
def int_arm64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic;
|
||||
|
||||
// Vector FP->Int Conversions
|
||||
def int_arm64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
def int_arm64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic;
|
||||
|
||||
// Vector FP Rounding: only ties to even is unrepresented by a normal
|
||||
// intrinsic.
|
||||
def int_arm64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic;
|
||||
|
||||
// Scalar FP->Int conversions
|
||||
|
||||
// Vector FP Inexact Narrowing
|
||||
def int_arm64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic;
|
||||
|
||||
// Scalar FP Inexact Narrowing
|
||||
def int_arm64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.".
|
||||
class AdvSIMD_2Vector2Index_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Vector element to element moves
|
||||
def int_arm64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic;
|
||||
|
||||
let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.".
|
||||
class AdvSIMD_1Vec_Load_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_1Vec_Store_Lane_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<2>]>;
|
||||
|
||||
class AdvSIMD_2Vec_Load_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
|
||||
[LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_2Vec_Load_Lane_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_2Vec_Store_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadWriteArgMem, NoCapture<2>]>;
|
||||
class AdvSIMD_2Vec_Store_Lane_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<3>]>;
|
||||
|
||||
class AdvSIMD_3Vec_Load_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_3Vec_Load_Lane_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_3Vec_Store_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadWriteArgMem, NoCapture<3>]>;
|
||||
class AdvSIMD_3Vec_Store_Lane_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<4>]>;
|
||||
|
||||
class AdvSIMD_4Vec_Load_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_4Vec_Load_Lane_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadArgMem]>;
|
||||
class AdvSIMD_4Vec_Store_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
LLVMAnyPointerType<LLVMMatchType<0>>],
|
||||
[IntrReadWriteArgMem, NoCapture<4>]>;
|
||||
class AdvSIMD_4Vec_Store_Lane_Intrinsic
|
||||
: Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>, LLVMMatchType<0>,
|
||||
llvm_i64_ty, llvm_anyptr_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<5>]>;
|
||||
}
|
||||
|
||||
// Memory ops
|
||||
|
||||
def int_arm64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
|
||||
|
||||
def int_arm64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
|
||||
def int_arm64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
|
||||
def int_arm64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
|
||||
|
||||
def int_arm64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
|
||||
|
||||
def int_arm64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
|
||||
def int_arm64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
|
||||
def int_arm64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
|
||||
|
||||
def int_arm64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
|
||||
def int_arm64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
|
||||
|
||||
def int_arm64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
|
||||
def int_arm64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
|
||||
def int_arm64_neon_st4 : AdvSIMD_4Vec_Store_Intrinsic;
|
||||
|
||||
def int_arm64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic;
|
||||
def int_arm64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic;
|
||||
def int_arm64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic;
|
||||
|
||||
let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.".
|
||||
class AdvSIMD_Tbl1_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_Tbl2_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>;
|
||||
class AdvSIMD_Tbl3_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
|
||||
LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_Tbl4_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
|
||||
LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_Tbx1_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_Tbx2_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
|
||||
LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_Tbx3_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
|
||||
llvm_v16i8_ty, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
class AdvSIMD_Tbx4_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
|
||||
llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
def int_arm64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic;
|
||||
def int_arm64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic;
|
||||
def int_arm64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic;
|
||||
def int_arm64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic;
|
||||
|
||||
def int_arm64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic;
|
||||
def int_arm64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic;
|
||||
def int_arm64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic;
|
||||
def int_arm64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic;
|
||||
|
||||
let TargetPrefix = "arm64" in {
|
||||
class Crypto_AES_DataKey_Intrinsic
|
||||
: Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
|
||||
|
||||
class Crypto_AES_Data_Intrinsic
|
||||
: Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
|
||||
|
||||
// SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule
|
||||
// (v4i32).
|
||||
class Crypto_SHA_5Hash4Schedule_Intrinsic
|
||||
: Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
// SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule
|
||||
// (v4i32).
|
||||
class Crypto_SHA_1Hash_Intrinsic
|
||||
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
// SHA intrinsic taking 8 words of the schedule
|
||||
class Crypto_SHA_8Schedule_Intrinsic
|
||||
: Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
|
||||
|
||||
// SHA intrinsic taking 12 words of the schedule
|
||||
class Crypto_SHA_12Schedule_Intrinsic
|
||||
: Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
// SHA intrinsic taking 8 words of the hash and 4 of the schedule.
|
||||
class Crypto_SHA_8Hash4Schedule_Intrinsic
|
||||
: Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// AES
|
||||
def int_arm64_crypto_aese : Crypto_AES_DataKey_Intrinsic;
|
||||
def int_arm64_crypto_aesd : Crypto_AES_DataKey_Intrinsic;
|
||||
def int_arm64_crypto_aesmc : Crypto_AES_Data_Intrinsic;
|
||||
def int_arm64_crypto_aesimc : Crypto_AES_Data_Intrinsic;
|
||||
|
||||
// SHA1
|
||||
def int_arm64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic;
|
||||
|
||||
def int_arm64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic;
|
||||
|
||||
// SHA256
|
||||
def int_arm64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic;
|
||||
def int_arm64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// CRC32
|
||||
|
||||
let TargetPrefix = "arm64" in {
|
||||
|
||||
def int_arm64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
@@ -158,7 +158,13 @@ public:
    VK_TLSLDM,
    VK_TPOFF,
    VK_DTPOFF,
    VK_TLVP,      // Mach-O thread local variable relocation
    VK_TLVP,      // Mach-O thread local variable relocations
    VK_TLVPPAGE,
    VK_TLVPPAGEOFF,
    VK_PAGE,
    VK_PAGEOFF,
    VK_GOTPAGE,
    VK_GOTPAGEOFF,
    VK_SECREL,
    VK_WEAKREF,   // The link between the symbols in .weakref foo, bar
@ -408,6 +408,34 @@ namespace llvm {
|
||||
ARM_RELOC_HALF = 8,
|
||||
ARM_RELOC_HALF_SECTDIFF = 9,
|
||||
|
||||
// Constant values for the r_type field in an ARM64 architecture
|
||||
// llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
|
||||
// structure.
|
||||
|
||||
// For pointers.
|
||||
ARM64_RELOC_UNSIGNED = 0,
|
||||
// Must be followed by an ARM64_RELOC_UNSIGNED
|
||||
ARM64_RELOC_SUBTRACTOR = 1,
|
||||
// A B/BL instruction with 26-bit displacement.
|
||||
ARM64_RELOC_BRANCH26 = 2,
|
||||
// PC-rel distance to page of target.
|
||||
ARM64_RELOC_PAGE21 = 3,
|
||||
// Offset within page, scaled by r_length.
|
||||
ARM64_RELOC_PAGEOFF12 = 4,
|
||||
// PC-rel distance to page of GOT slot.
|
||||
ARM64_RELOC_GOT_LOAD_PAGE21 = 5,
|
||||
// Offset within page of GOT slot, scaled by r_length.
|
||||
ARM64_RELOC_GOT_LOAD_PAGEOFF12 = 6,
|
||||
// For pointers to GOT slots.
|
||||
ARM64_RELOC_POINTER_TO_GOT = 7,
|
||||
// PC-rel distance to page of TLVP slot.
|
||||
ARM64_RELOC_TLVP_LOAD_PAGE21 = 8,
|
||||
// Offset within page of TLVP slot, scaled by r_length.
|
||||
ARM64_RELOC_TLVP_LOAD_PAGEOFF12 = 9,
|
||||
// Must be followed by ARM64_RELOC_PAGE21 or ARM64_RELOC_PAGEOFF12.
|
||||
ARM64_RELOC_ADDEND = 10,
|
||||
|
||||
|
||||
// Constant values for the r_type field in an x86_64 architecture
|
||||
// llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
|
||||
// structure
|
||||
@ -914,6 +942,7 @@ namespace llvm {
|
||||
/* CPU_TYPE_MIPS = 8, */
|
||||
CPU_TYPE_MC98000 = 10, // Old Motorola PowerPC
|
||||
CPU_TYPE_ARM = 12,
|
||||
CPU_TYPE_ARM64 = CPU_TYPE_ARM | CPU_ARCH_ABI64,
|
||||
CPU_TYPE_SPARC = 14,
|
||||
CPU_TYPE_POWERPC = 18,
|
||||
CPU_TYPE_POWERPC64 = CPU_TYPE_POWERPC | CPU_ARCH_ABI64
|
||||
@ -987,6 +1016,10 @@ namespace llvm {
|
||||
CPU_SUBTYPE_ARM_V7EM = 16
|
||||
};
|
||||
|
||||
enum CPUSubTypeARM64 {
|
||||
CPU_SUBTYPE_ARM64_ALL = 0
|
||||
};
|
||||
|
||||
enum CPUSubTypeSPARC {
|
||||
CPU_SUBTYPE_SPARC_ALL = 0
|
||||
};
|
||||
|
@ -362,7 +362,6 @@ def bitconvert : SDNode<"ISD::BITCAST" , SDTUnaryOp>;
|
||||
def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>;
|
||||
def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>;
|
||||
|
||||
|
||||
def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>;
|
||||
def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>;
|
||||
def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>;
|
||||
@ -466,7 +465,7 @@ def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
|
||||
def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
|
||||
SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
|
||||
def concat_vectors : SDNode<"ISD::CONCAT_VECTORS",
|
||||
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1, 2>]>,[]>;
|
||||
SDTypeProfile<1, 2, [SDTCisSubVecOfVec<1, 0>, SDTCisSameAs<1, 2>]>,[]>;
|
||||
|
||||
// This operator does not do subvector type checking. The ARM
|
||||
// backend, at least, needs it.
|
||||
|
@ -167,6 +167,10 @@ void RuntimeDyldMachO::resolveRelocation(const SectionEntry &Section,
|
||||
resolveARMRelocation(LocalAddress, FinalAddress, (uintptr_t)Value, isPCRel,
|
||||
MachoType, Size, Addend);
|
||||
break;
|
||||
case Triple::arm64:
|
||||
resolveARM64Relocation(LocalAddress, FinalAddress, (uintptr_t)Value,
|
||||
isPCRel, MachoType, Size, Addend);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -293,6 +297,55 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
|
||||
return false;
|
||||
}
|
||||
|
||||
bool RuntimeDyldMachO::resolveARM64Relocation(uint8_t *LocalAddress,
|
||||
uint64_t FinalAddress,
|
||||
uint64_t Value, bool isPCRel,
|
||||
unsigned Type, unsigned Size,
|
||||
int64_t Addend) {
|
||||
// If the relocation is PC-relative, the value to be encoded is the
|
||||
// pointer difference.
|
||||
if (isPCRel)
|
||||
Value -= FinalAddress;
|
||||
|
||||
switch (Type) {
|
||||
default:
|
||||
llvm_unreachable("Invalid relocation type!");
|
||||
case MachO::ARM64_RELOC_UNSIGNED: {
|
||||
// Mask in the target value a byte at a time (we don't have an alignment
|
||||
// guarantee for the target address, so this is safest).
|
||||
uint8_t *p = (uint8_t *)LocalAddress;
|
||||
for (unsigned i = 0; i < Size; ++i) {
|
||||
*p++ = (uint8_t)Value;
|
||||
Value >>= 8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case MachO::ARM64_RELOC_BRANCH26: {
|
||||
// Mask the value into the target address. We know instructions are
|
||||
// 32-bit aligned, so we can do it all at once.
|
||||
uint32_t *p = (uint32_t *)LocalAddress;
|
||||
// The low two bits of the value are not encoded.
|
||||
Value >>= 2;
|
||||
// Mask the value to 26 bits.
|
||||
Value &= 0x3ffffff;
|
||||
// Insert the value into the instruction.
|
||||
*p = (*p & ~0x3ffffff) | Value;
|
||||
break;
|
||||
}
|
||||
case MachO::ARM64_RELOC_SUBTRACTOR:
|
||||
case MachO::ARM64_RELOC_PAGE21:
|
||||
case MachO::ARM64_RELOC_PAGEOFF12:
|
||||
case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
|
||||
case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
|
||||
case MachO::ARM64_RELOC_POINTER_TO_GOT:
|
||||
case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
|
||||
case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
|
||||
case MachO::ARM64_RELOC_ADDEND:
|
||||
return Error("Relocation type not implemented yet!");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
relocation_iterator RuntimeDyldMachO::processRelocationRef(
|
||||
unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj,
|
||||
ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols,
|
||||
|
@ -34,6 +34,9 @@ class RuntimeDyldMachO : public RuntimeDyldImpl {
|
||||
bool resolveARMRelocation(uint8_t *LocalAddress, uint64_t FinalAddress,
|
||||
uint64_t Value, bool isPCRel, unsigned Type,
|
||||
unsigned Size, int64_t Addend);
|
||||
bool resolveARM64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress,
|
||||
uint64_t Value, bool IsPCRel, unsigned Type,
|
||||
unsigned Size, int64_t Addend);
|
||||
|
||||
void resolveRelocation(const SectionEntry &Section, uint64_t Offset,
|
||||
uint64_t Value, uint32_t Type, int64_t Addend,
|
||||
|
@ -321,6 +321,8 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
|
||||
MCpu = "core2";
|
||||
else if (Triple.getArch() == llvm::Triple::x86)
|
||||
MCpu = "yonah";
|
||||
else if (Triple.getArch() == llvm::Triple::arm64)
|
||||
MCpu = "cyclone";
|
||||
}
|
||||
|
||||
TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options,
|
||||
|
@ -168,6 +168,8 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
|
||||
CPU = "core2";
|
||||
else if (Triple.getArch() == llvm::Triple::x86)
|
||||
CPU = "yonah";
|
||||
else if (Triple.getArch() == llvm::Triple::arm64)
|
||||
CPU = "cyclone";
|
||||
}
|
||||
|
||||
TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
|
||||
|
@ -179,6 +179,12 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
|
||||
case VK_TPOFF: return "TPOFF";
|
||||
case VK_DTPOFF: return "DTPOFF";
|
||||
case VK_TLVP: return "TLVP";
|
||||
case VK_TLVPPAGE: return "TLVPPAGE";
|
||||
case VK_TLVPPAGEOFF: return "TLVPPAGEOFF";
|
||||
case VK_PAGE: return "PAGE";
|
||||
case VK_PAGEOFF: return "PAGEOFF";
|
||||
case VK_GOTPAGE: return "GOTPAGE";
|
||||
case VK_GOTPAGEOFF: return "GOTPAGEOFF";
|
||||
case VK_SECREL: return "SECREL32";
|
||||
case VK_WEAKREF: return "WEAKREF";
|
||||
case VK_ARM_NONE: return "none";
|
||||
@ -300,6 +306,18 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
|
||||
.Case("dtpoff", VK_DTPOFF)
|
||||
.Case("TLVP", VK_TLVP)
|
||||
.Case("tlvp", VK_TLVP)
|
||||
.Case("TLVPPAGE", VK_TLVPPAGE)
|
||||
.Case("tlvppage", VK_TLVPPAGE)
|
||||
.Case("TLVPPAGEOFF", VK_TLVPPAGEOFF)
|
||||
.Case("tlvppageoff", VK_TLVPPAGEOFF)
|
||||
.Case("PAGE", VK_PAGE)
|
||||
.Case("page", VK_PAGE)
|
||||
.Case("PAGEOFF", VK_PAGEOFF)
|
||||
.Case("pageoff", VK_PAGEOFF)
|
||||
.Case("GOTPAGE", VK_GOTPAGE)
|
||||
.Case("gotpage", VK_GOTPAGE)
|
||||
.Case("GOTPAGEOFF", VK_GOTPAGEOFF)
|
||||
.Case("gotpageoff", VK_GOTPAGEOFF)
|
||||
.Case("IMGREL", VK_COFF_IMGREL32)
|
||||
.Case("imgrel", VK_COFF_IMGREL32)
|
||||
.Case("SECREL32", VK_SECREL)
|
||||
|
@ -22,6 +22,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
|
||||
IsFunctionEHFrameSymbolPrivate = false;
|
||||
SupportsWeakOmittedEHFrame = false;
|
||||
|
||||
if (T.isOSDarwin() && T.getArch() == Triple::arm64)
|
||||
SupportsCompactUnwindWithoutEHFrame = true;
|
||||
|
||||
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel
|
||||
| dwarf::DW_EH_PE_sdata4;
|
||||
LSDAEncoding = FDEEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel;
|
||||
@ -146,7 +149,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
|
||||
|
||||
COFFDebugSymbolsSection = 0;
|
||||
|
||||
if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) {
|
||||
if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) ||
|
||||
(T.isOSDarwin() && T.getArch() == Triple::arm64)) {
|
||||
CompactUnwindSection =
|
||||
Ctx->getMachOSection("__LD", "__compact_unwind",
|
||||
MachO::S_ATTR_DEBUG,
|
||||
@ -154,6 +158,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
|
||||
|
||||
if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86)
|
||||
CompactUnwindDwarfEHFrameOnly = 0x04000000;
|
||||
else if (T.getArch() == Triple::arm64)
|
||||
CompactUnwindDwarfEHFrameOnly = 0x03000000;
|
||||
}
|
||||
|
||||
// Debug Information.
|
||||
@ -763,6 +769,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
|
||||
// cellspu-apple-darwin. Perhaps we should fix in Triple?
|
||||
if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
|
||||
Arch == Triple::arm || Arch == Triple::thumb ||
|
||||
Arch == Triple::arm64 ||
|
||||
Arch == Triple::ppc || Arch == Triple::ppc64 ||
|
||||
Arch == Triple::UnknownArch) &&
|
||||
(T.isOSDarwin() || T.isOSBinFormatMachO())) {
|
||||
|
@ -13,6 +13,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Object/MachO.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Support/DataExtractor.h"
|
||||
#include "llvm/Support/Format.h"
|
||||
@ -934,6 +935,23 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
|
||||
res = Table[RType];
|
||||
break;
|
||||
}
|
||||
case Triple::arm64:
|
||||
case Triple::aarch64: {
|
||||
static const char *const Table[] = {
|
||||
"ARM64_RELOC_UNSIGNED", "ARM64_RELOC_SUBTRACTOR",
|
||||
"ARM64_RELOC_BRANCH26", "ARM64_RELOC_PAGE21",
|
||||
"ARM64_RELOC_PAGEOFF12", "ARM64_RELOC_GOT_LOAD_PAGE21",
|
||||
"ARM64_RELOC_GOT_LOAD_PAGEOFF12", "ARM64_RELOC_POINTER_TO_GOT",
|
||||
"ARM64_RELOC_TLVP_LOAD_PAGE21", "ARM64_RELOC_TLVP_LOAD_PAGEOFF12",
|
||||
"ARM64_RELOC_ADDEND"
|
||||
};
|
||||
|
||||
if (RType >= array_lengthof(Table))
|
||||
res = "Unknown";
|
||||
else
|
||||
res = Table[RType];
|
||||
break;
|
||||
}
|
||||
case Triple::ppc: {
|
||||
static const char *const Table[] = {
|
||||
"PPC_RELOC_VANILLA",
|
||||
@ -1256,6 +1274,8 @@ StringRef MachOObjectFile::getFileFormatName() const {
|
||||
switch (CPUType) {
|
||||
case llvm::MachO::CPU_TYPE_X86_64:
|
||||
return "Mach-O 64-bit x86-64";
|
||||
case llvm::MachO::CPU_TYPE_ARM64:
|
||||
return "Mach-O arm64";
|
||||
case llvm::MachO::CPU_TYPE_POWERPC64:
|
||||
return "Mach-O 64-bit ppc64";
|
||||
default:
|
||||
@ -1271,6 +1291,8 @@ Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
|
||||
return Triple::x86_64;
|
||||
case llvm::MachO::CPU_TYPE_ARM:
|
||||
return Triple::arm;
|
||||
case llvm::MachO::CPU_TYPE_ARM64:
|
||||
return Triple::arm64;
|
||||
case llvm::MachO::CPU_TYPE_POWERPC:
|
||||
return Triple::ppc;
|
||||
case llvm::MachO::CPU_TYPE_POWERPC64:
|
||||
|
@ -23,6 +23,7 @@ const char *Triple::getArchTypeName(ArchType Kind) {
|
||||
case aarch64_be: return "aarch64_be";
|
||||
case arm: return "arm";
|
||||
case armeb: return "armeb";
|
||||
case arm64: return "arm64";
|
||||
case hexagon: return "hexagon";
|
||||
case mips: return "mips";
|
||||
case mipsel: return "mipsel";
|
||||
@ -66,6 +67,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
|
||||
case thumb:
|
||||
case thumbeb: return "arm";
|
||||
|
||||
case arm64: return "arm64";
|
||||
|
||||
case ppc64:
|
||||
case ppc64le:
|
||||
case ppc: return "ppc";
|
||||
@ -91,6 +94,7 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
|
||||
|
||||
case nvptx: return "nvptx";
|
||||
case nvptx64: return "nvptx";
|
||||
|
||||
case le32: return "le32";
|
||||
case amdil: return "amdil";
|
||||
case spir: return "spir";
|
||||
@ -173,6 +177,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
|
||||
.Case("aarch64_be", aarch64_be)
|
||||
.Case("arm", arm)
|
||||
.Case("armeb", armeb)
|
||||
.Case("arm64", arm64)
|
||||
.Case("mips", mips)
|
||||
.Case("mipsel", mipsel)
|
||||
.Case("mips64", mips64)
|
||||
@ -219,6 +224,7 @@ const char *Triple::getArchNameForAssembler() {
|
||||
.Cases("armv6", "thumbv6", "armv6")
|
||||
.Cases("armv7", "thumbv7", "armv7")
|
||||
.Case("armeb", "armeb")
|
||||
.Case("arm64", "arm64")
|
||||
.Case("r600", "r600")
|
||||
.Case("nvptx", "nvptx")
|
||||
.Case("nvptx64", "nvptx64")
|
||||
@ -250,6 +256,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
|
||||
.StartsWith("thumbv", Triple::thumb)
|
||||
.Case("thumbeb", Triple::thumbeb)
|
||||
.StartsWith("thumbebv", Triple::thumbeb)
|
||||
.Case("arm64", Triple::arm64)
|
||||
.Case("msp430", Triple::msp430)
|
||||
.Cases("mips", "mipseb", "mipsallegrex", Triple::mips)
|
||||
.Cases("mipsel", "mipsallegrexel", Triple::mipsel)
|
||||
@ -681,9 +688,9 @@ void Triple::getiOSVersion(unsigned &Major, unsigned &Minor,
|
||||
break;
|
||||
case IOS:
|
||||
getOSVersion(Major, Minor, Micro);
|
||||
// Default to 5.0.
|
||||
// Default to 5.0 (or 7.0 for arm64).
|
||||
if (Major == 0)
|
||||
Major = 5;
|
||||
Major = (getArch() == arm64) ? 7 : 5;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -771,6 +778,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
|
||||
case llvm::Triple::spir:
|
||||
return 32;
|
||||
|
||||
case llvm::Triple::arm64:
|
||||
case llvm::Triple::aarch64:
|
||||
case llvm::Triple::aarch64_be:
|
||||
case llvm::Triple::mips64:
|
||||
@ -838,6 +846,7 @@ Triple Triple::get32BitArchVariant() const {
|
||||
case Triple::sparcv9: T.setArch(Triple::sparc); break;
|
||||
case Triple::x86_64: T.setArch(Triple::x86); break;
|
||||
case Triple::spir64: T.setArch(Triple::spir); break;
|
||||
case Triple::arm64: T.setArch(Triple::arm); break;
|
||||
}
|
||||
return T;
|
||||
}
|
||||
@ -847,7 +856,6 @@ Triple Triple::get64BitArchVariant() const {
|
||||
switch (getArch()) {
|
||||
case Triple::UnknownArch:
|
||||
case Triple::amdil:
|
||||
case Triple::arm:
|
||||
case Triple::armeb:
|
||||
case Triple::hexagon:
|
||||
case Triple::le32:
|
||||
@ -871,6 +879,7 @@ Triple Triple::get64BitArchVariant() const {
|
||||
case Triple::sparcv9:
|
||||
case Triple::systemz:
|
||||
case Triple::x86_64:
|
||||
case Triple::arm64:
|
||||
// Already 64-bit.
|
||||
break;
|
||||
|
||||
@ -881,6 +890,7 @@ Triple Triple::get64BitArchVariant() const {
|
||||
case Triple::sparc: T.setArch(Triple::sparcv9); break;
|
||||
case Triple::x86: T.setArch(Triple::x86_64); break;
|
||||
case Triple::spir: T.setArch(Triple::spir64); break;
|
||||
case Triple::arm: T.setArch(Triple::arm64); break;
|
||||
}
|
||||
return T;
|
||||
}
|
||||
|
@ -205,7 +205,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
|
||||
void* start = NearBlock ? (unsigned char*)NearBlock->base() +
|
||||
NearBlock->size() : 0;
|
||||
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_EXEC,
|
||||
flags, fd, 0);
|
||||
#else
|
||||
@ -220,7 +220,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
|
||||
return MemoryBlock();
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)pa,
|
||||
(vm_size_t)(PageSize*NumPages), 0,
|
||||
VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
|
||||
@ -253,7 +253,7 @@ bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
|
||||
}
|
||||
|
||||
bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
if (M.Address == 0 || M.Size == 0) return false;
|
||||
Memory::InvalidateInstructionCache(M.Address, M.Size);
|
||||
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
|
||||
@ -265,7 +265,7 @@ bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
|
||||
}
|
||||
|
||||
bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
if (M.Address == 0 || M.Size == 0) return false;
|
||||
Memory::InvalidateInstructionCache(M.Address, M.Size);
|
||||
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
|
||||
@ -280,7 +280,7 @@ bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
|
||||
}
|
||||
|
||||
bool Memory::setRangeWritable(const void *Addr, size_t Size) {
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
|
||||
(vm_size_t)Size, 0,
|
||||
VM_PROT_READ | VM_PROT_WRITE);
|
||||
@ -291,7 +291,7 @@ bool Memory::setRangeWritable(const void *Addr, size_t Size) {
|
||||
}
|
||||
|
||||
bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
|
||||
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
|
||||
(vm_size_t)Size, 0,
|
||||
VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
|
||||
@ -311,7 +311,8 @@ void Memory::InvalidateInstructionCache(const void *Addr,
|
||||
#if defined(__APPLE__)
|
||||
|
||||
# if (defined(__POWERPC__) || defined (__ppc__) || \
|
||||
defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
|
||||
defined(_POWER) || defined(_ARCH_PPC) || defined(__arm__) || \
|
||||
defined(__arm64__))
|
||||
sys_icache_invalidate(const_cast<void *>(Addr), Len);
|
||||
# endif
|
||||
|
||||
|
lib/Target/ARM64/ARM64.h (new file, 48 lines)
@ -0,0 +1,48 @@
//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the entry points for global functions defined in the LLVM
// ARM64 back-end.
//
//===----------------------------------------------------------------------===//

#ifndef TARGET_ARM64_H
#define TARGET_ARM64_H

#include "MCTargetDesc/ARM64BaseInfo.h"
#include "MCTargetDesc/ARM64MCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/DataTypes.h"

namespace llvm {

class ARM64TargetMachine;
class FunctionPass;
class MachineFunctionPass;

FunctionPass *createARM64DeadRegisterDefinitions();
FunctionPass *createARM64ConditionalCompares();
FunctionPass *createARM64AdvSIMDScalar();
FunctionPass *createARM64BranchRelaxation();
FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createARM64StorePairSuppressPass();
FunctionPass *createARM64ExpandPseudoPass();
FunctionPass *createARM64LoadStoreOptimizationPass();
ModulePass *createARM64PromoteConstantPass();
FunctionPass *createARM64AddressTypePromotionPass();
/// \brief Creates an ARM64-specific Target Transformation Info pass.
ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM);

FunctionPass *createARM64CleanupLocalDynamicTLSPass();

FunctionPass *createARM64CollectLOHPass();
} // end namespace llvm

#endif
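The factory functions declared above are the hooks a target's pass pipeline calls when it sets up code generation. As a rough, hypothetical sketch of how they would typically be wired into a TargetPassConfig subclass (the ARM64PassConfig class name and the exact hook choices are assumptions for illustration; the real wiring is not shown in this excerpt):

#include "ARM64.h"
#include "llvm/CodeGen/Passes.h"
using namespace llvm;

namespace {
// Hypothetical sketch only: register ARM64 passes in the codegen pipeline.
class ARM64PassConfig : public TargetPassConfig {
public:
  ARM64PassConfig(TargetMachine *TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  virtual bool addInstSelector() {
    // SelectionDAG instruction selection for the arm64 triple.
    addPass(createARM64ISelDag(static_cast<ARM64TargetMachine &>(*TM),
                               getOptLevel()));
    return false;
  }

  virtual bool addILPOpts() {
    // Optional machine-level optimizations provided by this backend.
    addPass(createARM64ConditionalCompares());
    addPass(createARM64AdvSIMDScalar());
    addPass(createARM64StorePairSuppressPass());
    return true;
  }
};
} // end anonymous namespace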
95	lib/Target/ARM64/ARM64.td	Normal file
@ -0,0 +1,95 @@
//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Target-independent interfaces which we are implementing
//===----------------------------------------------------------------------===//

include "llvm/Target/Target.td"

//===----------------------------------------------------------------------===//
// ARM64 Subtarget features.
//

/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;

/// Cyclone has instructions which zero registers for "free".
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;

//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//

include "ARM64RegisterInfo.td"
include "ARM64CallingConvention.td"

//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//

include "ARM64Schedule.td"
include "ARM64InstrInfo.td"

def ARM64InstrInfo : InstrInfo;

//===----------------------------------------------------------------------===//
// ARM64 Processors supported.
//
include "ARM64SchedCyclone.td"

def : ProcessorModel<"arm64-generic", NoSchedModel, []>;

def : ProcessorModel<"cyclone", CycloneModel, [FeatureZCRegMove, FeatureZCZeroing]>;

//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//

def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
}

def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
}

//===----------------------------------------------------------------------===//
// Assembly printer
//===----------------------------------------------------------------------===//
// ARM64 uses the MC printer for asm output, so make sure the TableGen
// AsmWriter bits get associated with the correct class.
def GenericAsmWriter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
int Variant = 0;
bit isMCAsmWriter = 1;
}

def AppleAsmWriter : AsmWriter {
let AsmWriterClassName = "AppleInstPrinter";
int Variant = 1;
int isMCAsmWriter = 1;
}

//===----------------------------------------------------------------------===//
// Target Declaration
//===----------------------------------------------------------------------===//

def ARM64 : Target {
let InstructionSet = ARM64InstrInfo;
let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
}
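Each SubtargetFeature above names the C++ field it sets (HasZeroCycleRegMove, HasZeroCycleZeroing); the cyclone ProcessorModel enables both, while arm64-generic enables neither. A minimal sketch of how the backend's C++ side would typically consult such a feature (the ARM64Subtarget accessor name is an assumption based on the field name and is not shown in this excerpt):

// Sketch only: guard a Cyclone-specific idiom behind the "zcz" feature bit.
static bool shouldUseZeroCycleZeroing(const ARM64Subtarget &ST) {
  return ST.hasZeroCycleZeroing();
}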
505	lib/Target/ARM64/ARM64AddressTypePromotion.cpp	Normal file
@ -0,0 +1,505 @@
|
||||
|
||||
//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This pass tries to promote the computations used to obtain a sign-extended
// value used in memory accesses.
|
||||
// E.g.
|
||||
// a = add nsw i32 b, 3
|
||||
// d = sext i32 a to i64
|
||||
// e = getelementptr ..., i64 d
|
||||
//
|
||||
// =>
|
||||
// f = sext i32 b to i64
|
||||
// a = add nsw i64 f, 3
|
||||
// e = getelementptr ..., i64 a
|
||||
//
|
||||
// This is legal to do if the computations are marked with either the nsw or
// nuw flag.
|
||||
// Moreover, the current heuristic is simple: it does not create new sext
|
||||
// operations, i.e., it gives up when a sext would have forked (e.g., if
|
||||
// a = add i32 b, c, two sexts are required to promote the computation).
|
||||
//
|
||||
// FIXME: This pass may be useful for other targets too.
|
||||
// ===---------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-type-promotion"
|
||||
#include "ARM64.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Operator.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool>
|
||||
EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden,
|
||||
cl::desc("Enable the type promotion pass"),
|
||||
cl::init(true));
|
||||
static cl::opt<bool>
|
||||
EnableMerge("arm64-type-promotion-merge", cl::Hidden,
|
||||
cl::desc("Enable merging of redundant sexts when one is dominating"
|
||||
" the other."),
|
||||
cl::init(true));
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ARM64AddressTypePromotion
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace llvm {
|
||||
void initializeARM64AddressTypePromotionPass(PassRegistry &);
|
||||
}
|
||||
|
||||
namespace {
|
||||
class ARM64AddressTypePromotion : public FunctionPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
ARM64AddressTypePromotion()
|
||||
: FunctionPass(ID), Func(NULL), ConsideredSExtType(NULL) {
|
||||
initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "ARM64 Address Type Promotion";
|
||||
}
|
||||
|
||||
/// Iterate over the functions and promote the computation of interesting
|
||||
// sext instructions.
|
||||
bool runOnFunction(Function &F);
|
||||
|
||||
private:
|
||||
/// The current function.
|
||||
Function *Func;
|
||||
/// Filter out all sexts that do not have this type.
|
||||
/// Currently initialized with Int64Ty.
|
||||
Type *ConsideredSExtType;
|
||||
|
||||
// This transformation requires dominator info.
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
|
||||
typedef SmallVector<Instruction *, 16> Instructions;
|
||||
typedef DenseMap<Value *, Instructions> ValueToInsts;
|
||||
|
||||
/// Check if it is profitable to move a sext through this instruction.
|
||||
/// Currently, we consider it is profitable if:
|
||||
/// - Inst is used only once (no need to insert truncate).
|
||||
/// - Inst has only one operand that will require a sext operation (we
/// do not create new sext operations).
|
||||
bool shouldGetThrough(const Instruction *Inst);
|
||||
|
||||
/// Check if it is possible and legal to move a sext through this
|
||||
/// instruction.
|
||||
/// Current heuristic considers that we can get through:
|
||||
/// - Arithmetic operation marked with the nsw or nuw flag.
|
||||
/// - Other sext operation.
|
||||
/// - Truncate operation if it was just dropping sign extended bits.
|
||||
bool canGetThrough(const Instruction *Inst);
|
||||
|
||||
/// Move sext operations through safe to sext instructions.
|
||||
bool propagateSignExtension(Instructions &SExtInsts);
|
||||
|
||||
/// Check whether this sext should be considered for code motion.
|
||||
/// We look for sext with ConsideredSExtType and uses in at least one
|
||||
// GetElementPtrInst.
|
||||
bool shouldConsiderSExt(const Instruction *SExt) const;
|
||||
|
||||
/// Collect all interesting sext operations, i.e., the ones with the right
|
||||
/// type and used in memory accesses.
|
||||
/// More precisely, a sext instruction is considered as interesting if it
|
||||
/// is used in a "complex" getelementptr or it exits at least another
|
||||
/// sext instruction that sign extended the same initial value.
|
||||
/// A getelementptr is considered as "complex" if it has more than 2
|
||||
// operands.
|
||||
void analyzeSExtension(Instructions &SExtInsts);
|
||||
|
||||
/// Merge redundant sign extension operations in common dominator.
|
||||
void mergeSExts(ValueToInsts &ValToSExtendedUses,
|
||||
SetOfInstructions &ToRemove);
|
||||
};
|
||||
} // end anonymous namespace.
|
||||
|
||||
char ARM64AddressTypePromotion::ID = 0;
|
||||
|
||||
INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion",
|
||||
"ARM64 Type Promotion Pass", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||
INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion",
|
||||
"ARM64 Type Promotion Pass", false, false)
|
||||
|
||||
FunctionPass *llvm::createARM64AddressTypePromotionPass() {
|
||||
return new ARM64AddressTypePromotion();
|
||||
}
|
||||
|
||||
bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
|
||||
if (isa<SExtInst>(Inst))
|
||||
return true;
|
||||
|
||||
const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
|
||||
if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
|
||||
(BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
|
||||
return true;
|
||||
|
||||
// sext(trunc(sext)) --> sext
|
||||
if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
|
||||
const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
|
||||
// Check that the truncate just drops sign-extended bits.
|
||||
if (Inst->getType()->getIntegerBitWidth() >=
|
||||
Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
|
||||
Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
|
||||
ConsideredSExtType->getIntegerBitWidth())
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
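The trunc case above is the subtle one: a truncate of a sign-extended value is transparent only when the truncated type still holds every bit of the original, pre-extension value. Restated as a small self-contained predicate over bit widths (an illustrative paraphrase of the condition above, not LLVM API):

// For trunc (sext i16 %y to i64) to i32: DstBits=32 >= OrigBits=16, and the
// trunc source (64 bits) is no wider than the considered sext type (i64),
// so the truncate only drops sign bits and can be looked through.
static bool truncOnlyDropsSignBits(unsigned TruncDstBits, unsigned OrigBits,
                                   unsigned TruncSrcBits,
                                   unsigned ConsideredBits) {
  return TruncDstBits >= OrigBits && TruncSrcBits <= ConsideredBits;
}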
|
||||
|
||||
bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
|
||||
// If the type of the sext is the same as the considered one, this sext
|
||||
// will become useless.
|
||||
// Otherwise, we will have to do something to preserve the original value,
|
||||
// unless it is used once.
|
||||
if (isa<SExtInst>(Inst) &&
|
||||
(Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
|
||||
return true;
|
||||
|
||||
// If the Inst is used more than once, we may need to insert truncate
|
||||
// operations and we don't do that at the moment.
|
||||
if (!Inst->hasOneUse())
|
||||
return false;
|
||||
|
||||
// This truncate is used only once, thus if we can get through, it will become
|
||||
// useless.
|
||||
if (isa<TruncInst>(Inst))
|
||||
return true;
|
||||
|
||||
// If both operands are not constant, a new sext will be created here.
|
||||
// Current heuristic is: each step should be profitable.
|
||||
// Therefore we don't allow increasing the number of sexts even if it may
|
||||
// be profitable later on.
|
||||
if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
|
||||
if (isa<SelectInst>(Inst) && OpIdx == 0)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
|
||||
if (SExt->getType() != ConsideredSExtType)
|
||||
return false;
|
||||
|
||||
for (Value::const_use_iterator UseIt = SExt->use_begin(),
|
||||
EndUseIt = SExt->use_end();
|
||||
UseIt != EndUseIt; ++UseIt) {
|
||||
if (isa<GetElementPtrInst>(*UseIt))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Input:
|
||||
// - SExtInsts contains all the sext instructions that are used directly in
|
||||
// GetElementPtrInst, i.e., access to memory.
|
||||
// Algorithm:
|
||||
// - For each sext operation in SExtInsts:
|
||||
// Let var be the operand of sext.
|
||||
// while it is profitable (see shouldGetThrough), legal, and safe
|
||||
// (see canGetThrough) to move sext through var's definition:
|
||||
// * promote the type of var's definition.
|
||||
// * fold var into sext uses.
|
||||
// * move sext above var's definition.
|
||||
// * update sext operand to use the operand of var that should be sign
|
||||
// extended (by construction there is only one).
|
||||
//
|
||||
// E.g.,
|
||||
// a = ... i32 c, 3
|
||||
// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
|
||||
// ...
|
||||
// = b
|
||||
// => Yes, update the code
|
||||
// b = sext i32 c to i64
|
||||
// a = ... i64 b, 3
|
||||
// ...
|
||||
// = a
|
||||
// Iterate on 'c'.
|
||||
bool
|
||||
ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
|
||||
DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
|
||||
|
||||
bool LocalChange = false;
|
||||
SetOfInstructions ToRemove;
|
||||
ValueToInsts ValToSExtendedUses;
|
||||
while (!SExtInsts.empty()) {
|
||||
// Get through simple chain.
|
||||
Instruction *SExt = SExtInsts.pop_back_val();
|
||||
|
||||
DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
|
||||
|
||||
// If this SExt has already been merged continue.
|
||||
if (SExt->use_empty() && ToRemove.count(SExt)) {
|
||||
DEBUG(dbgs() << "No uses => marked as delete\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Now try to get through the chain of definitions.
|
||||
while (isa<Instruction>(SExt->getOperand(0))) {
|
||||
Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0));
|
||||
DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
|
||||
if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
|
||||
// We cannot get through something that is not an Instruction
|
||||
// or not safe to SExt.
|
||||
DEBUG(dbgs() << "Cannot get through\n");
|
||||
break;
|
||||
}
|
||||
|
||||
LocalChange = true;
|
||||
// If this is a sign extend, it becomes useless.
|
||||
if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
|
||||
DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
|
||||
// We cannot use replaceAllUsesWith here because we may trigger some
|
||||
// assertion on the type, as all involved sext operations may not have
// been moved yet.
|
||||
while (!Inst->use_empty()) {
|
||||
Value::use_iterator UseIt = Inst->use_begin();
|
||||
Instruction *UseInst = dyn_cast<Instruction>(*UseIt);
|
||||
assert(UseInst && "Use of sext is not an Instruction!");
|
||||
UseInst->setOperand(UseIt->getOperandNo(), SExt);
|
||||
}
|
||||
ToRemove.insert(Inst);
|
||||
SExt->setOperand(0, Inst->getOperand(0));
|
||||
SExt->moveBefore(Inst);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get through the Instruction:
|
||||
// 1. Update its type.
|
||||
// 2. Replace the uses of SExt by Inst.
|
||||
// 3. Sign extend each operand that needs to be sign extended.
|
||||
|
||||
// Step #1.
|
||||
Inst->mutateType(SExt->getType());
|
||||
// Step #2.
|
||||
SExt->replaceAllUsesWith(Inst);
|
||||
// Step #3.
|
||||
Instruction *SExtForOpnd = SExt;
|
||||
|
||||
DEBUG(dbgs() << "Propagate SExt to operands\n");
|
||||
for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
|
||||
++OpIdx) {
|
||||
DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
|
||||
if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
|
||||
!shouldSExtOperand(Inst, OpIdx)) {
|
||||
DEBUG(dbgs() << "No need to propagate\n");
|
||||
continue;
|
||||
}
|
||||
// Check if we can statically sign extend the operand.
|
||||
Value *Opnd = Inst->getOperand(OpIdx);
|
||||
if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
|
||||
DEBUG(dbgs() << "Statically sign extend\n");
|
||||
Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
|
||||
Cst->getSExtValue()));
|
||||
continue;
|
||||
}
|
||||
// UndefValue are typed, so we have to statically sign extend them.
|
||||
if (isa<UndefValue>(Opnd)) {
|
||||
DEBUG(dbgs() << "Statically sign extend\n");
|
||||
Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise we have to explicitly sign extend it.
|
||||
assert(SExtForOpnd &&
|
||||
"Only one operand should have been sign extended");
|
||||
|
||||
SExtForOpnd->setOperand(0, Opnd);
|
||||
|
||||
DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
|
||||
// Move the sign extension before the insertion point.
|
||||
SExtForOpnd->moveBefore(Inst);
|
||||
Inst->setOperand(OpIdx, SExtForOpnd);
|
||||
// If more sext are required, new instructions will have to be created.
|
||||
SExtForOpnd = NULL;
|
||||
}
|
||||
if (SExtForOpnd == SExt) {
|
||||
DEBUG(dbgs() << "Sign extension is useless now\n");
|
||||
ToRemove.insert(SExt);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the use is already of the right type, connect its uses to its argument
|
||||
// and delete it.
|
||||
// This can happen for an Instruction all of whose uses are sign extended.
|
||||
if (!ToRemove.count(SExt) &&
|
||||
SExt->getType() == SExt->getOperand(0)->getType()) {
|
||||
DEBUG(dbgs() << "Sign extension is useless, attach its use to "
|
||||
"its argument\n");
|
||||
SExt->replaceAllUsesWith(SExt->getOperand(0));
|
||||
ToRemove.insert(SExt);
|
||||
} else
|
||||
ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
|
||||
}
|
||||
|
||||
if (EnableMerge)
|
||||
mergeSExts(ValToSExtendedUses, ToRemove);
|
||||
|
||||
// Remove all instructions marked as ToRemove.
|
||||
for (SetOfInstructions::iterator ToRemoveIt = ToRemove.begin(),
|
||||
EndToRemoveIt = ToRemove.end();
|
||||
ToRemoveIt != EndToRemoveIt; ++ToRemoveIt)
|
||||
(*ToRemoveIt)->eraseFromParent();
|
||||
return LocalChange;
|
||||
}
|
||||
|
||||
void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
|
||||
SetOfInstructions &ToRemove) {
|
||||
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
|
||||
for (ValueToInsts::iterator It = ValToSExtendedUses.begin(),
|
||||
EndIt = ValToSExtendedUses.end();
|
||||
It != EndIt; ++It) {
|
||||
Instructions &Insts = It->second;
|
||||
Instructions CurPts;
|
||||
for (Instructions::iterator IIt = Insts.begin(), EndIIt = Insts.end();
|
||||
IIt != EndIIt; ++IIt) {
|
||||
if (ToRemove.count(*IIt))
|
||||
continue;
|
||||
bool inserted = false;
|
||||
for (Instructions::iterator CurPtsIt = CurPts.begin(),
|
||||
EndCurPtsIt = CurPts.end();
|
||||
CurPtsIt != EndCurPtsIt; ++CurPtsIt) {
|
||||
if (DT.dominates(*IIt, *CurPtsIt)) {
|
||||
DEBUG(dbgs() << "Replace all uses of:\n" << **CurPtsIt << "\nwith:\n"
|
||||
<< **IIt << '\n');
|
||||
(*CurPtsIt)->replaceAllUsesWith(*IIt);
|
||||
ToRemove.insert(*CurPtsIt);
|
||||
*CurPtsIt = *IIt;
|
||||
inserted = true;
|
||||
break;
|
||||
}
|
||||
if (!DT.dominates(*CurPtsIt, *IIt))
|
||||
// Give up if we need to merge in a common dominator as the
|
||||
// experiments show it is not profitable.
|
||||
continue;
|
||||
|
||||
DEBUG(dbgs() << "Replace all uses of:\n" << **IIt << "\nwith:\n"
|
||||
<< **CurPtsIt << '\n');
|
||||
(*IIt)->replaceAllUsesWith(*CurPtsIt);
|
||||
ToRemove.insert(*IIt);
|
||||
inserted = true;
|
||||
break;
|
||||
}
|
||||
if (!inserted)
|
||||
CurPts.push_back(*IIt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
|
||||
DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
|
||||
|
||||
DenseMap<Value *, Instruction *> SeenChains;
|
||||
|
||||
for (Function::iterator IBB = Func->begin(), IEndBB = Func->end();
|
||||
IBB != IEndBB; ++IBB) {
|
||||
for (BasicBlock::iterator II = IBB->begin(), IEndI = IBB->end();
|
||||
II != IEndI; ++II) {
|
||||
|
||||
// Collect all sext operation per type.
|
||||
if (!isa<SExtInst>(II) || !shouldConsiderSExt(II))
|
||||
continue;
|
||||
Instruction *SExt = II;
|
||||
|
||||
DEBUG(dbgs() << "Found:\n" << (*II) << '\n');
|
||||
|
||||
// Cases where we actually perform the optimization:
|
||||
// 1. SExt is used in a getelementptr with more than 2 operands =>
|
||||
// likely we can merge some computation if they are done on 64 bits.
|
||||
// 2. The beginning of the SExt chain is sign extended several times. =>
|
||||
// code sharing is possible.
|
||||
|
||||
bool insert = false;
|
||||
// #1.
|
||||
for (Value::use_iterator UseIt = SExt->use_begin(),
|
||||
EndUseIt = SExt->use_end();
|
||||
UseIt != EndUseIt; ++UseIt) {
|
||||
const Instruction *Inst = dyn_cast<GetElementPtrInst>(*UseIt);
|
||||
if (Inst && Inst->getNumOperands() > 2) {
|
||||
DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
|
||||
<< '\n');
|
||||
insert = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// #2.
|
||||
// Check the head of the chain.
|
||||
Instruction *Inst = SExt;
|
||||
Value *Last;
|
||||
do {
|
||||
int OpdIdx = 0;
|
||||
const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
|
||||
if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
|
||||
OpdIdx = 1;
|
||||
Last = Inst->getOperand(OpdIdx);
|
||||
Inst = dyn_cast<Instruction>(Last);
|
||||
} while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
|
||||
|
||||
DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
|
||||
DenseMap<Value *, Instruction *>::iterator AlreadySeen =
|
||||
SeenChains.find(Last);
|
||||
if (insert || AlreadySeen != SeenChains.end()) {
|
||||
DEBUG(dbgs() << "Insert\n");
|
||||
SExtInsts.push_back(II);
|
||||
if (AlreadySeen != SeenChains.end() && AlreadySeen->second != NULL) {
|
||||
DEBUG(dbgs() << "Insert chain member\n");
|
||||
SExtInsts.push_back(AlreadySeen->second);
|
||||
SeenChains[Last] = NULL;
|
||||
}
|
||||
} else {
|
||||
DEBUG(dbgs() << "Record its chain membership\n");
|
||||
SeenChains[Last] = SExt;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ARM64AddressTypePromotion::runOnFunction(Function &F) {
|
||||
if (!EnableAddressTypePromotion || F.isDeclaration())
|
||||
return false;
|
||||
Func = &F;
|
||||
ConsideredSExtType = Type::getInt64Ty(Func->getContext());
|
||||
|
||||
DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
|
||||
|
||||
Instructions SExtInsts;
|
||||
analyzeSExtension(SExtInsts);
|
||||
return propagateSignExtension(SExtInsts);
|
||||
}
392	lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp	Normal file
@ -0,0 +1,392 @@
|
||||
//===-- ARM64AdvSIMDScalarPass.cpp - Use AdvSIMD scalar instructions -----===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// When profitable, replace GPR targeting i64 instructions with their
|
||||
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
|
||||
// as minimizing the number of cross-class register copies.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// TODO: Graph based predicate heuristics.
|
||||
// Walking the instruction list linearly will get many, perhaps most, of
|
||||
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
|
||||
//
|
||||
// This optimization is very similar in spirit to the register allocator's
|
||||
// spill placement, only here we're determining where to place cross-class
|
||||
// register copies rather than spills. As such, a similar approach is
|
||||
// called for.
|
||||
//
|
||||
// We want to build up a set of graphs of all instructions which are candidates
|
||||
// for transformation along with instructions which generate their inputs and
|
||||
// consume their outputs. For each edge in the graph, we assign a weight
|
||||
// based on whether there is a copy required there (weight zero if not) and
|
||||
// the block frequency of the block containing the defining or using
|
||||
// instruction, whichever is less. Our optimization is then a graph problem
|
||||
// to minimize the total weight of all the graphs, then transform instructions
|
||||
// and add or remove copy instructions as called for to implement the
|
||||
// solution.
|
||||
//===----------------------------------------------------------------------===//
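Compared with the graph formulation sketched in this TODO, the heuristic actually implemented below (see isProfitableToTransform) is deliberately simple: count the cross-class copies a transform would add against the ones it would make removable, and only transform when the balance does not get worse. Distilled to its arithmetic (a restatement for illustration, not LLVM API):

// Restatement of the profitability test used by isProfitableToTransform():
// transform when we don't add more copies than we can remove, or when the
// -arm64-simd-scalar-force-all stress-testing flag forces every candidate.
static bool worthTransforming(unsigned NumNewCopies,
                              unsigned NumRemovableCopies, bool ForceAll) {
  return NumNewCopies <= NumRemovableCopies || ForceAll;
}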
|
||||
|
||||
#define DEBUG_TYPE "arm64-simd-scalar"
|
||||
#include "ARM64.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool>
|
||||
AdvSIMDScalar("arm64-simd-scalar",
|
||||
cl::desc("enable use of AdvSIMD scalar integer instructions"),
|
||||
cl::init(false), cl::Hidden);
|
||||
// Allow forcing all i64 operations with equivalent SIMD instructions to use
|
||||
// them. For stress-testing the transformation function.
|
||||
static cl::opt<bool>
|
||||
TransformAll("arm64-simd-scalar-force-all",
|
||||
cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
|
||||
cl::init(false), cl::Hidden);
|
||||
|
||||
STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
|
||||
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
|
||||
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
|
||||
|
||||
namespace {
|
||||
class ARM64AdvSIMDScalar : public MachineFunctionPass {
|
||||
MachineRegisterInfo *MRI;
|
||||
const ARM64InstrInfo *TII;
|
||||
|
||||
private:
|
||||
// isProfitableToTransform - Predicate function to determine whether an
|
||||
// instruction should be transformed to its equivalent AdvSIMD scalar
|
||||
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
|
||||
bool isProfitableToTransform(const MachineInstr *MI) const;
|
||||
|
||||
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
|
||||
// to be the correct register class, minimizing cross-class copies.
|
||||
void transformInstruction(MachineInstr *MI);
|
||||
|
||||
// processMachineBasicBlock - Main optimization loop.
|
||||
bool processMachineBasicBlock(MachineBasicBlock *MBB);
|
||||
|
||||
public:
|
||||
static char ID; // Pass identification, replacement for typeid.
|
||||
explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {}
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &F);
|
||||
|
||||
const char *getPassName() const {
|
||||
return "AdvSIMD scalar operation optimization";
|
||||
}
|
||||
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
char ARM64AdvSIMDScalar::ID = 0;
|
||||
} // end anonymous namespace
|
||||
|
||||
static bool isGPR64(unsigned Reg, unsigned SubReg,
|
||||
const MachineRegisterInfo *MRI) {
|
||||
if (SubReg)
|
||||
return false;
|
||||
if (TargetRegisterInfo::isVirtualRegister(Reg))
|
||||
return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass);
|
||||
return ARM64::GPR64RegClass.contains(Reg);
|
||||
}
|
||||
|
||||
static bool isFPR64(unsigned Reg, unsigned SubReg,
|
||||
const MachineRegisterInfo *MRI) {
|
||||
if (TargetRegisterInfo::isVirtualRegister(Reg))
|
||||
return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) &&
|
||||
SubReg == 0) ||
|
||||
(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) &&
|
||||
SubReg == ARM64::dsub);
|
||||
// Physical register references just check the register class directly.
|
||||
return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
|
||||
(ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub);
|
||||
}
|
||||
|
||||
// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
|
||||
// copy instruction. Return zero_reg if the instruction is not a copy.
|
||||
static unsigned getSrcFromCopy(const MachineInstr *MI,
|
||||
const MachineRegisterInfo *MRI,
|
||||
unsigned &SubReg) {
|
||||
SubReg = 0;
|
||||
// The "FMOV Xd, Dn" instruction is the typical form.
|
||||
if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr)
|
||||
return MI->getOperand(1).getReg();
|
||||
// A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
|
||||
// these at this stage, but it's easy to check for.
|
||||
if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
|
||||
SubReg = ARM64::dsub;
|
||||
return MI->getOperand(1).getReg();
|
||||
}
|
||||
// Or just a plain COPY instruction. This can be directly to/from FPR64,
|
||||
// or it can be a dsub subreg reference to an FPR128.
|
||||
if (MI->getOpcode() == ARM64::COPY) {
|
||||
if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
|
||||
MRI) &&
|
||||
isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
|
||||
return MI->getOperand(1).getReg();
|
||||
if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
|
||||
MRI) &&
|
||||
isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
|
||||
MRI)) {
|
||||
SubReg = ARM64::dsub;
|
||||
return MI->getOperand(1).getReg();
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, this is some other kind of instruction.
|
||||
return 0;
|
||||
}
|
||||
|
||||
// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
|
||||
// that we're considering transforming to, return that AdvSIMD opcode. For all
|
||||
// others, return the original opcode.
|
||||
static int getTransformOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
break;
|
||||
// FIXME: Lots more possibilities.
|
||||
case ARM64::ADDXrr:
|
||||
return ARM64::ADDv1i64;
|
||||
case ARM64::SUBXrr:
|
||||
return ARM64::SUBv1i64;
|
||||
}
|
||||
// No AdvSIMD equivalent, so just return the original opcode.
|
||||
return Opc;
|
||||
}
|
||||
|
||||
static bool isTransformable(const MachineInstr *MI) {
|
||||
int Opc = MI->getOpcode();
|
||||
return Opc != getTransformOpcode(Opc);
|
||||
}
|
||||
|
||||
// isProfitableToTransform - Predicate function to determine whether an
|
||||
// instruction should be transformed to its equivalent AdvSIMD scalar
|
||||
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
|
||||
bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
|
||||
// If this instruction isn't eligible to be transformed (no SIMD equivalent),
|
||||
// early exit since that's the common case.
|
||||
if (!isTransformable(MI))
|
||||
return false;
|
||||
|
||||
// Count the number of copies we'll need to add and approximate the number
|
||||
// of copies that a transform will enable us to remove.
|
||||
unsigned NumNewCopies = 3;
|
||||
unsigned NumRemovableCopies = 0;
|
||||
|
||||
unsigned OrigSrc0 = MI->getOperand(1).getReg();
|
||||
unsigned OrigSrc1 = MI->getOperand(2).getReg();
|
||||
unsigned Src0 = 0, SubReg0;
|
||||
unsigned Src1 = 0, SubReg1;
|
||||
if (!MRI->def_empty(OrigSrc0)) {
|
||||
MachineRegisterInfo::def_instr_iterator Def =
|
||||
MRI->def_instr_begin(OrigSrc0);
|
||||
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
|
||||
Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
|
||||
// If the source was from a copy, we don't need to insert a new copy.
|
||||
if (Src0)
|
||||
--NumNewCopies;
|
||||
// If there are no other users of the original source, we can delete
|
||||
// that instruction.
|
||||
if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
|
||||
++NumRemovableCopies;
|
||||
}
|
||||
if (!MRI->def_empty(OrigSrc1)) {
|
||||
MachineRegisterInfo::def_instr_iterator Def =
|
||||
MRI->def_instr_begin(OrigSrc1);
|
||||
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
|
||||
Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
|
||||
if (Src1)
|
||||
--NumNewCopies;
|
||||
// If there are no other users of the original source, we can delete
|
||||
// that instruction.
|
||||
if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
|
||||
++NumRemovableCopies;
|
||||
}
|
||||
|
||||
// If any of the uses of the original instructions is a cross class copy,
|
||||
// that's a copy that will be removable if we transform. Likewise, if
|
||||
// any of the uses is a transformable instruction, it's likely the transforms
|
||||
// will chain, enabling us to save a copy there, too. This is an aggressive
|
||||
// heuristic that approximates the graph based cost analysis described above.
|
||||
unsigned Dst = MI->getOperand(0).getReg();
|
||||
bool AllUsesAreCopies = true;
|
||||
for (MachineRegisterInfo::use_instr_nodbg_iterator
|
||||
Use = MRI->use_instr_nodbg_begin(Dst),
|
||||
E = MRI->use_instr_nodbg_end();
|
||||
Use != E; ++Use) {
|
||||
unsigned SubReg;
|
||||
if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
|
||||
++NumRemovableCopies;
|
||||
// If the use is an INSERT_SUBREG, that's still something that can
|
||||
// directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
|
||||
// preferable to have it use the FPR64 in most cases, as if the source
|
||||
// vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
|
||||
// Ditto for a lane insert.
|
||||
else if (Use->getOpcode() == ARM64::INSERT_SUBREG ||
|
||||
Use->getOpcode() == ARM64::INSvi64gpr)
|
||||
;
|
||||
else
|
||||
AllUsesAreCopies = false;
|
||||
}
|
||||
// If all of the uses of the original destination register are copies to
|
||||
// FPR64, then we won't end up having a new copy back to GPR64 either.
|
||||
if (AllUsesAreCopies)
|
||||
--NumNewCopies;
|
||||
|
||||
// If a transform will not increase the number of cross-class copies required,
|
||||
// return true.
|
||||
if (NumNewCopies <= NumRemovableCopies)
|
||||
return true;
|
||||
|
||||
// Finally, even if we otherwise wouldn't transform, check if we're forcing
|
||||
// transformation of everything.
|
||||
return TransformAll;
|
||||
}
|
||||
|
||||
static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI,
|
||||
unsigned Dst, unsigned Src, bool IsKill) {
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY),
|
||||
Dst)
|
||||
.addReg(Src, getKillRegState(IsKill));
|
||||
DEBUG(dbgs() << " adding copy: " << *MIB);
|
||||
++NumCopiesInserted;
|
||||
return MIB;
|
||||
}
|
||||
|
||||
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
|
||||
// to be the correct register class, minimizing cross-class copies.
|
||||
void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
|
||||
DEBUG(dbgs() << "Scalar transform: " << *MI);
|
||||
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
int OldOpc = MI->getOpcode();
|
||||
int NewOpc = getTransformOpcode(OldOpc);
|
||||
assert(OldOpc != NewOpc && "transform an instruction to itself?!");
|
||||
|
||||
// Check if we need a copy for the source registers.
|
||||
unsigned OrigSrc0 = MI->getOperand(1).getReg();
|
||||
unsigned OrigSrc1 = MI->getOperand(2).getReg();
|
||||
unsigned Src0 = 0, SubReg0;
|
||||
unsigned Src1 = 0, SubReg1;
|
||||
if (!MRI->def_empty(OrigSrc0)) {
|
||||
MachineRegisterInfo::def_instr_iterator Def =
|
||||
MRI->def_instr_begin(OrigSrc0);
|
||||
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
|
||||
Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
|
||||
// If there are no other users of the original source, we can delete
|
||||
// that instruction.
|
||||
if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
|
||||
assert(Src0 && "Can't delete copy w/o a valid original source!");
|
||||
Def->eraseFromParent();
|
||||
++NumCopiesDeleted;
|
||||
}
|
||||
}
|
||||
if (!MRI->def_empty(OrigSrc1)) {
|
||||
MachineRegisterInfo::def_instr_iterator Def =
|
||||
MRI->def_instr_begin(OrigSrc1);
|
||||
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
|
||||
Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
|
||||
// If there are no other users of the original source, we can delete
|
||||
// that instruction.
|
||||
if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
|
||||
assert(Src1 && "Can't delete copy w/o a valid original source!");
|
||||
Def->eraseFromParent();
|
||||
++NumCopiesDeleted;
|
||||
}
|
||||
}
|
||||
// If we weren't able to reference the original source directly, create a
|
||||
// copy.
|
||||
if (!Src0) {
|
||||
SubReg0 = 0;
|
||||
Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
|
||||
insertCopy(TII, MI, Src0, OrigSrc0, true);
|
||||
}
|
||||
if (!Src1) {
|
||||
SubReg1 = 0;
|
||||
Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
|
||||
insertCopy(TII, MI, Src1, OrigSrc1, true);
|
||||
}
|
||||
|
||||
// Create a vreg for the destination.
|
||||
// FIXME: No need to do this if the ultimate user expects an FPR64.
|
||||
// Check for that and avoid the copy if possible.
|
||||
unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
|
||||
|
||||
// For now, all of the new instructions have the same simple three-register
|
||||
// form, so no need to special case based on what instruction we're
|
||||
// building.
|
||||
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
|
||||
.addReg(Src0, getKillRegState(true), SubReg0)
|
||||
.addReg(Src1, getKillRegState(true), SubReg1);
|
||||
|
||||
// Now copy the result back out to a GPR.
|
||||
// FIXME: Try to avoid this if all uses could actually just use the FPR64
|
||||
// directly.
|
||||
insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
|
||||
|
||||
// Erase the old instruction.
|
||||
MI->eraseFromParent();
|
||||
|
||||
++NumScalarInsnsUsed;
|
||||
}
|
||||
|
||||
// processMachineBasicBlock - Main optimization loop.
|
||||
bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
|
||||
bool Changed = false;
|
||||
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
|
||||
MachineInstr *MI = I;
|
||||
++I;
|
||||
if (isProfitableToTransform(MI)) {
|
||||
transformInstruction(MI);
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// runOnMachineFunction - Pass entry point from PassManager.
|
||||
bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
|
||||
// Early exit if pass disabled.
|
||||
if (!AdvSIMDScalar)
|
||||
return false;
|
||||
|
||||
bool Changed = false;
|
||||
DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n");
|
||||
|
||||
const TargetMachine &TM = mf.getTarget();
|
||||
MRI = &mf.getRegInfo();
|
||||
TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
|
||||
|
||||
// Just check things on a one-block-at-a-time basis.
|
||||
for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
|
||||
if (processMachineBasicBlock(I))
|
||||
Changed = true;
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine
|
||||
// to add the pass to the PassManager.
|
||||
FunctionPass *llvm::createARM64AdvSIMDScalar() {
|
||||
return new ARM64AdvSIMDScalar();
|
||||
}
573	lib/Target/ARM64/ARM64AsmPrinter.cpp	Normal file
@ -0,0 +1,573 @@
|
||||
//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains a printer that converts from our internal representation
|
||||
// of machine-dependent LLVM code to the ARM64 assembly language.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "asm-printer"
|
||||
#include "ARM64.h"
|
||||
#include "ARM64MachineFunctionInfo.h"
|
||||
#include "ARM64MCInstLower.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "InstPrinter/ARM64InstPrinter.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/CodeGen/AsmPrinter.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/StackMaps.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/DebugInfo.h"
|
||||
#include "llvm/MC/MCAsmInfo.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCInstBuilder.h"
|
||||
#include "llvm/MC/MCLinkerOptimizationHint.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
class ARM64AsmPrinter : public AsmPrinter {
|
||||
ARM64MCInstLower MCInstLowering;
|
||||
StackMaps SM;
|
||||
|
||||
public:
|
||||
ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
|
||||
: AsmPrinter(TM, Streamer), MCInstLowering(OutContext, *Mang, *this),
|
||||
SM(*this), ARM64FI(NULL), LOHLabelCounter(0) {}
|
||||
|
||||
virtual const char *getPassName() const { return "ARM64 Assembly Printer"; }
|
||||
|
||||
/// \brief Wrapper for MCInstLowering.lowerOperand() for the
|
||||
/// tblgen'erated pseudo lowering.
|
||||
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
|
||||
return MCInstLowering.lowerOperand(MO, MCOp);
|
||||
}
|
||||
|
||||
void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
|
||||
const MachineInstr &MI);
|
||||
void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
|
||||
const MachineInstr &MI);
|
||||
/// \brief tblgen'erated driver function for lowering simple MI->MC
|
||||
/// pseudo instructions.
|
||||
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
|
||||
const MachineInstr *MI);
|
||||
|
||||
void EmitInstruction(const MachineInstr *MI);
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AsmPrinter::getAnalysisUsage(AU);
|
||||
AU.setPreservesAll();
|
||||
}
|
||||
|
||||
bool runOnMachineFunction(MachineFunction &F) {
|
||||
ARM64FI = F.getInfo<ARM64FunctionInfo>();
|
||||
return AsmPrinter::runOnMachineFunction(F);
|
||||
}
|
||||
|
||||
private:
|
||||
MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
|
||||
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
|
||||
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
|
||||
bool printAsmRegInClass(const MachineOperand &MO,
|
||||
const TargetRegisterClass *RC, bool isVector,
|
||||
raw_ostream &O);
|
||||
|
||||
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
|
||||
unsigned AsmVariant, const char *ExtraCode,
|
||||
raw_ostream &O);
|
||||
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
|
||||
unsigned AsmVariant, const char *ExtraCode,
|
||||
raw_ostream &O);
|
||||
|
||||
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
|
||||
|
||||
void EmitFunctionBodyEnd();
|
||||
|
||||
MCSymbol *GetCPISymbol(unsigned CPID) const;
|
||||
void EmitEndOfAsmFile(Module &M);
|
||||
ARM64FunctionInfo *ARM64FI;
|
||||
|
||||
/// \brief Emit the LOHs contained in ARM64FI.
|
||||
void EmitLOHs();
|
||||
|
||||
typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
|
||||
MInstToMCSymbol LOHInstToLabel;
|
||||
unsigned LOHLabelCounter;
|
||||
};
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) {
|
||||
// Funny Darwin hack: This flag tells the linker that no global symbols
|
||||
// contain code that falls through to other global symbols (e.g. the obvious
|
||||
// implementation of multiple entry points). If this doesn't occur, the
|
||||
// linker can safely perform dead code stripping. Since LLVM never
|
||||
// generates code that does this, it is always safe to set.
|
||||
OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
|
||||
SM.serializeToStackMapSection();
|
||||
}
|
||||
|
||||
MachineLocation
|
||||
ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
|
||||
MachineLocation Location;
|
||||
assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
|
||||
// Frame address. Currently handles register +- offset only.
|
||||
if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
|
||||
Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
|
||||
else {
|
||||
DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
|
||||
}
|
||||
return Location;
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::EmitLOHs() {
|
||||
const ARM64FunctionInfo::MILOHDirectives &LOHs =
|
||||
const_cast<const ARM64FunctionInfo *>(ARM64FI)
|
||||
->getLOHContainer()
|
||||
.getDirectives();
|
||||
SmallVector<MCSymbol *, 3> MCArgs;
|
||||
|
||||
for (ARM64FunctionInfo::MILOHDirectives::const_iterator It = LOHs.begin(),
|
||||
EndIt = LOHs.end();
|
||||
It != EndIt; ++It) {
|
||||
const ARM64FunctionInfo::MILOHArgs &MIArgs = It->getArgs();
|
||||
for (ARM64FunctionInfo::MILOHArgs::const_iterator
|
||||
MIArgsIt = MIArgs.begin(),
|
||||
EndMIArgsIt = MIArgs.end();
|
||||
MIArgsIt != EndMIArgsIt; ++MIArgsIt) {
|
||||
MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(*MIArgsIt);
|
||||
assert(LabelIt != LOHInstToLabel.end() &&
|
||||
"Label hasn't been inserted for LOH related instruction");
|
||||
MCArgs.push_back(LabelIt->second);
|
||||
}
|
||||
OutStreamer.EmitLOHDirective(It->getKind(), MCArgs);
|
||||
MCArgs.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::EmitFunctionBodyEnd() {
|
||||
if (!ARM64FI->getLOHRelated().empty())
|
||||
EmitLOHs();
|
||||
}
|
||||
|
||||
/// GetCPISymbol - Return the symbol for the specified constant pool entry.
|
||||
MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const {
|
||||
// Darwin uses a linker-private symbol name for constant-pools (to
|
||||
// avoid addends on the relocation?), ELF has no such concept and
|
||||
// uses a normal private symbol.
|
||||
if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
|
||||
return OutContext.GetOrCreateSymbol(
|
||||
Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
|
||||
Twine(getFunctionNumber()) + "_" + Twine(CPID));
|
||||
|
||||
return OutContext.GetOrCreateSymbol(
|
||||
Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
|
||||
Twine(getFunctionNumber()) + "_" + Twine(CPID));
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
|
||||
raw_ostream &O) {
|
||||
const MachineOperand &MO = MI->getOperand(OpNum);
|
||||
switch (MO.getType()) {
|
||||
default:
|
||||
assert(0 && "<unknown operand type>");
|
||||
case MachineOperand::MO_Register: {
|
||||
unsigned Reg = MO.getReg();
|
||||
assert(TargetRegisterInfo::isPhysicalRegister(Reg));
|
||||
assert(!MO.getSubReg() && "Subregs should be eliminated!");
|
||||
O << ARM64InstPrinter::getRegisterName(Reg);
|
||||
break;
|
||||
}
|
||||
case MachineOperand::MO_Immediate: {
|
||||
int64_t Imm = MO.getImm();
|
||||
O << '#' << Imm;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
|
||||
raw_ostream &O) {
|
||||
unsigned Reg = MO.getReg();
|
||||
switch (Mode) {
|
||||
default:
|
||||
return true; // Unknown mode.
|
||||
case 'w':
|
||||
Reg = getWRegFromXReg(Reg);
|
||||
break;
|
||||
case 'x':
|
||||
Reg = getXRegFromWReg(Reg);
|
||||
break;
|
||||
}
|
||||
|
||||
O << ARM64InstPrinter::getRegisterName(Reg);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Prints the register in MO using class RC, mapping the register's encoding
// onto the new register class. This should not be used for cross-class
// printing.
|
||||
bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
|
||||
const TargetRegisterClass *RC,
|
||||
bool isVector, raw_ostream &O) {
|
||||
assert(MO.isReg() && "Should only get here with a register!");
|
||||
const ARM64RegisterInfo *RI =
|
||||
static_cast<const ARM64RegisterInfo *>(TM.getRegisterInfo());
|
||||
unsigned Reg = MO.getReg();
|
||||
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
|
||||
assert(RI->regsOverlap(RegToPrint, Reg));
|
||||
O << ARM64InstPrinter::getRegisterName(
|
||||
RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
|
||||
unsigned AsmVariant,
|
||||
const char *ExtraCode, raw_ostream &O) {
|
||||
const MachineOperand &MO = MI->getOperand(OpNum);
|
||||
// Does this asm operand have a single letter operand modifier?
|
||||
if (ExtraCode && ExtraCode[0]) {
|
||||
if (ExtraCode[1] != 0)
|
||||
return true; // Unknown modifier.
|
||||
|
||||
switch (ExtraCode[0]) {
|
||||
default:
|
||||
return true; // Unknown modifier.
|
||||
case 'w': // Print W register
|
||||
case 'x': // Print X register
|
||||
if (MO.isReg())
|
||||
return printAsmMRegister(MO, ExtraCode[0], O);
|
||||
if (MO.isImm() && MO.getImm() == 0) {
|
||||
unsigned Reg = ExtraCode[0] == 'w' ? ARM64::WZR : ARM64::XZR;
|
||||
O << ARM64InstPrinter::getRegisterName(Reg);
|
||||
return false;
|
||||
}
|
||||
printOperand(MI, OpNum, O);
|
||||
return false;
|
||||
case 'b': // Print B register.
|
||||
case 'h': // Print H register.
|
||||
case 's': // Print S register.
|
||||
case 'd': // Print D register.
|
||||
case 'q': // Print Q register.
|
||||
if (MO.isReg()) {
|
||||
const TargetRegisterClass *RC;
|
||||
switch (ExtraCode[0]) {
|
||||
case 'b':
|
||||
RC = &ARM64::FPR8RegClass;
|
||||
break;
|
||||
case 'h':
|
||||
RC = &ARM64::FPR16RegClass;
|
||||
break;
|
||||
case 's':
|
||||
RC = &ARM64::FPR32RegClass;
|
||||
break;
|
||||
case 'd':
|
||||
RC = &ARM64::FPR64RegClass;
|
||||
break;
|
||||
case 'q':
|
||||
RC = &ARM64::FPR128RegClass;
|
||||
break;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
return printAsmRegInClass(MO, RC, false /* vector */, O);
|
||||
}
|
||||
printOperand(MI, OpNum, O);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// According to ARM, we should emit x and v registers unless we have a
|
||||
// modifier.
|
||||
if (MO.isReg()) {
|
||||
unsigned Reg = MO.getReg();
|
||||
|
||||
// If this is a w or x register, print an x register.
|
||||
if (ARM64::GPR32allRegClass.contains(Reg) ||
|
||||
ARM64::GPR64allRegClass.contains(Reg))
|
||||
return printAsmMRegister(MO, 'x', O);
|
||||
|
||||
// If this is a b, h, s, d, or q register, print it as a v register.
|
||||
return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O);
|
||||
}
|
||||
|
||||
printOperand(MI, OpNum, O);
|
||||
return false;
|
||||
}
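The modifier letters handled above ('w', 'x', 'b', 'h', 's', 'd', 'q') are what user inline assembly passes through to this printer. An illustrative C/C++ example of the 'w' modifier (ordinary GCC/Clang inline-asm syntax, not part of this commit):

// %w0/%w1/%w2 ask the asm printer for the 32-bit names of the registers
// holding the int operands; with no modifier, the X register name is printed.
static inline int add32(int a, int b) {
  int r;
  __asm__("add %w0, %w1, %w2" : "=r"(r) : "r"(a), "r"(b));
  return r;
}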
|
||||
|
||||
bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
|
||||
unsigned OpNum, unsigned AsmVariant,
|
||||
const char *ExtraCode,
|
||||
raw_ostream &O) {
|
||||
if (ExtraCode && ExtraCode[0])
|
||||
return true; // Unknown modifier.
|
||||
|
||||
const MachineOperand &MO = MI->getOperand(OpNum);
|
||||
assert(MO.isReg() && "unexpected inline asm memory operand");
|
||||
O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]";
|
||||
return false;
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
|
||||
raw_ostream &OS) {
|
||||
unsigned NOps = MI->getNumOperands();
|
||||
assert(NOps == 4);
|
||||
OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
|
||||
// cast away const; DIVariable etc. do not take const operands for some reason.
|
||||
DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
|
||||
OS << V.getName();
|
||||
OS << " <- ";
|
||||
// Frame address. Currently handles register +- offset only.
|
||||
assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
|
||||
OS << '[';
|
||||
printOperand(MI, 0, OS);
|
||||
OS << '+';
|
||||
printOperand(MI, 1, OS);
|
||||
OS << ']';
|
||||
OS << "+";
|
||||
printOperand(MI, NOps - 2, OS);
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
|
||||
const MachineInstr &MI) {
|
||||
unsigned NumNOPBytes = MI.getOperand(1).getImm();
|
||||
|
||||
SM.recordStackMap(MI);
|
||||
// Emit padding.
|
||||
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
|
||||
for (unsigned i = 0; i < NumNOPBytes; i += 4)
|
||||
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
|
||||
}
|
||||
|
||||
// Lower a patchpoint of the form:
|
||||
// [<def>], <id>, <numBytes>, <target>, <numArgs>
|
||||
void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
|
||||
const MachineInstr &MI) {
|
||||
SM.recordPatchPoint(MI);
|
||||
|
||||
PatchPointOpers Opers(&MI);
|
||||
|
||||
int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
|
||||
unsigned EncodedBytes = 0;
|
||||
if (CallTarget) {
|
||||
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
|
||||
"High 16 bits of call target should be zero.");
|
||||
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
|
||||
EncodedBytes = 16;
|
||||
// Materialize the jump address:
|
||||
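// The 48-bit target is built 16 bits at a time: MOVZ writes bits [47:32] and
// zeroes the rest, then the two MOVKs insert bits [31:16] and [15:0].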
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi)
|
||||
.addReg(ScratchReg)
|
||||
.addImm((CallTarget >> 32) & 0xFFFF)
|
||||
.addImm(32));
|
||||
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
|
||||
.addReg(ScratchReg)
|
||||
.addReg(ScratchReg)
|
||||
.addImm((CallTarget >> 16) & 0xFFFF)
|
||||
.addImm(16));
|
||||
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
|
||||
.addReg(ScratchReg)
|
||||
.addReg(ScratchReg)
|
||||
.addImm(CallTarget & 0xFFFF)
|
||||
.addImm(0));
|
||||
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg));
|
||||
}
|
||||
// Emit padding.
|
||||
unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
|
||||
assert(NumBytes >= EncodedBytes &&
|
||||
"Patchpoint can't request size less than the length of a call.");
|
||||
assert((NumBytes - EncodedBytes) % 4 == 0 &&
|
||||
"Invalid number of NOP bytes requested!");
|
||||
for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
|
||||
EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
|
||||
}
|
||||
|
||||
// Simple pseudo-instructions have their lowering (with expansion to real
|
||||
// instructions) auto-generated.
|
||||
#include "ARM64GenMCPseudoLowering.inc"
|
||||
|
||||
static unsigned getRealIndexedOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
case ARM64::LDRXpre_isel: return ARM64::LDRXpre;
|
||||
case ARM64::LDRWpre_isel: return ARM64::LDRWpre;
|
||||
case ARM64::LDRDpre_isel: return ARM64::LDRDpre;
|
||||
case ARM64::LDRSpre_isel: return ARM64::LDRSpre;
|
||||
case ARM64::LDRBBpre_isel: return ARM64::LDRBBpre;
|
||||
case ARM64::LDRHHpre_isel: return ARM64::LDRHHpre;
|
||||
case ARM64::LDRSBWpre_isel: return ARM64::LDRSBWpre;
|
||||
case ARM64::LDRSBXpre_isel: return ARM64::LDRSBXpre;
|
||||
case ARM64::LDRSHWpre_isel: return ARM64::LDRSHWpre;
|
||||
case ARM64::LDRSHXpre_isel: return ARM64::LDRSHXpre;
|
||||
case ARM64::LDRSWpre_isel: return ARM64::LDRSWpre;
|
||||
|
||||
case ARM64::LDRDpost_isel: return ARM64::LDRDpost;
|
||||
case ARM64::LDRSpost_isel: return ARM64::LDRSpost;
|
||||
case ARM64::LDRXpost_isel: return ARM64::LDRXpost;
|
||||
case ARM64::LDRWpost_isel: return ARM64::LDRWpost;
|
||||
case ARM64::LDRHHpost_isel: return ARM64::LDRHHpost;
|
||||
case ARM64::LDRBBpost_isel: return ARM64::LDRBBpost;
|
||||
case ARM64::LDRSWpost_isel: return ARM64::LDRSWpost;
|
||||
case ARM64::LDRSHWpost_isel: return ARM64::LDRSHWpost;
|
||||
case ARM64::LDRSHXpost_isel: return ARM64::LDRSHXpost;
|
||||
case ARM64::LDRSBWpost_isel: return ARM64::LDRSBWpost;
|
||||
case ARM64::LDRSBXpost_isel: return ARM64::LDRSBXpost;
|
||||
|
||||
case ARM64::STRXpre_isel: return ARM64::STRXpre;
|
||||
case ARM64::STRWpre_isel: return ARM64::STRWpre;
|
||||
case ARM64::STRHHpre_isel: return ARM64::STRHHpre;
|
||||
case ARM64::STRBBpre_isel: return ARM64::STRBBpre;
|
||||
case ARM64::STRDpre_isel: return ARM64::STRDpre;
|
||||
case ARM64::STRSpre_isel: return ARM64::STRSpre;
|
||||
}
|
||||
llvm_unreachable("Unexpected pre-indexed opcode!");
|
||||
}
|
||||
|
||||
void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
|
||||
// Do any auto-generated pseudo lowerings.
|
||||
if (emitPseudoExpansionLowering(OutStreamer, MI))
|
||||
return;
|
||||
|
||||
if (ARM64FI->getLOHRelated().count(MI)) {
|
||||
// Generate a label for LOH related instruction
|
||||
MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
|
||||
// Associate the instruction with the label
|
||||
LOHInstToLabel[MI] = LOHLabel;
|
||||
OutStreamer.EmitLabel(LOHLabel);
|
||||
}
|
||||
|
||||
// Do any manual lowerings.
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
break;
|
||||
case ARM64::DBG_VALUE: {
|
||||
if (isVerbose() && OutStreamer.hasRawTextSupport()) {
|
||||
SmallString<128> TmpStr;
|
||||
raw_svector_ostream OS(TmpStr);
|
||||
PrintDebugValueComment(MI, OS);
|
||||
OutStreamer.EmitRawText(StringRef(OS.str()));
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Indexed loads and stores use a pseudo to handle complex operand
|
||||
// tricks and writeback to the base register. We strip off the writeback
|
||||
// operand and switch the opcode here. Post-indexed stores were handled by the
|
||||
// tablegen'erated pseudos above. (The complex operand <--> simple
|
||||
// operand isel is beyond tablegen's ability, so we do these manually).
|
||||
case ARM64::LDRHHpre_isel:
|
||||
case ARM64::LDRBBpre_isel:
|
||||
case ARM64::LDRXpre_isel:
|
||||
case ARM64::LDRWpre_isel:
|
||||
case ARM64::LDRDpre_isel:
|
||||
case ARM64::LDRSpre_isel:
|
||||
case ARM64::LDRSBWpre_isel:
|
||||
case ARM64::LDRSBXpre_isel:
|
||||
case ARM64::LDRSHWpre_isel:
|
||||
case ARM64::LDRSHXpre_isel:
|
||||
case ARM64::LDRSWpre_isel:
|
||||
case ARM64::LDRDpost_isel:
|
||||
case ARM64::LDRSpost_isel:
|
||||
case ARM64::LDRXpost_isel:
|
||||
case ARM64::LDRWpost_isel:
|
||||
case ARM64::LDRHHpost_isel:
|
||||
case ARM64::LDRBBpost_isel:
|
||||
case ARM64::LDRSWpost_isel:
|
||||
case ARM64::LDRSHWpost_isel:
|
||||
case ARM64::LDRSHXpost_isel:
|
||||
case ARM64::LDRSBWpost_isel:
|
||||
case ARM64::LDRSBXpost_isel: {
|
||||
MCInst TmpInst;
|
||||
// For loads, the writeback operand to be skipped is the second.
|
||||
TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
|
||||
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
|
||||
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
|
||||
TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
|
||||
EmitToStreamer(OutStreamer, TmpInst);
|
||||
return;
|
||||
}
|
||||
case ARM64::STRXpre_isel:
|
||||
case ARM64::STRWpre_isel:
|
||||
case ARM64::STRHHpre_isel:
|
||||
case ARM64::STRBBpre_isel:
|
||||
case ARM64::STRDpre_isel:
|
||||
case ARM64::STRSpre_isel: {
|
||||
MCInst TmpInst;
|
||||
// For stores, the writeback operand to be skipped is the first.
|
||||
TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
|
||||
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
|
||||
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
|
||||
TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
|
||||
EmitToStreamer(OutStreamer, TmpInst);
|
||||
return;
|
||||
}
|
||||
|
||||
// Tail calls use pseudo instructions so they have the proper code-gen
|
||||
// attributes (isCall, isReturn, etc.). We lower them to the real
|
||||
// instruction here.
|
||||
case ARM64::TCRETURNri: {
|
||||
MCInst TmpInst;
|
||||
TmpInst.setOpcode(ARM64::BR);
|
||||
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
|
||||
EmitToStreamer(OutStreamer, TmpInst);
|
||||
return;
|
||||
}
|
||||
case ARM64::TCRETURNdi: {
|
||||
MCOperand Dest;
|
||||
MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
|
||||
MCInst TmpInst;
|
||||
TmpInst.setOpcode(ARM64::B);
|
||||
TmpInst.addOperand(Dest);
|
||||
EmitToStreamer(OutStreamer, TmpInst);
|
||||
return;
|
||||
}
|
||||
case ARM64::TLSDESC_BLR: {
|
||||
MCOperand Callee, Sym;
|
||||
MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
|
||||
MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
|
||||
|
||||
// First emit a relocation-annotation. This expands to no code, but requests
|
||||
// the following instruction gets an R_AARCH64_TLSDESC_CALL.
|
||||
MCInst TLSDescCall;
|
||||
TLSDescCall.setOpcode(ARM64::TLSDESCCALL);
|
||||
TLSDescCall.addOperand(Sym);
|
||||
EmitToStreamer(OutStreamer, TLSDescCall);
|
||||
|
||||
// Other than that it's just a normal indirect call to the function loaded
|
||||
// from the descriptor.
|
||||
MCInst BLR;
|
||||
BLR.setOpcode(ARM64::BLR);
|
||||
BLR.addOperand(Callee);
|
||||
EmitToStreamer(OutStreamer, BLR);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
case TargetOpcode::STACKMAP:
|
||||
return LowerSTACKMAP(OutStreamer, SM, *MI);
|
||||
|
||||
case TargetOpcode::PATCHPOINT:
|
||||
return LowerPATCHPOINT(OutStreamer, SM, *MI);
|
||||
}
|
||||
|
||||
// Finally, do the automated lowerings for everything else.
|
||||
MCInst TmpInst;
|
||||
MCInstLowering.Lower(MI, TmpInst);
|
||||
EmitToStreamer(OutStreamer, TmpInst);
|
||||
}
|
||||
|
||||
// Force static initialization.
|
||||
extern "C" void LLVMInitializeARM64AsmPrinter() {
|
||||
RegisterAsmPrinter<ARM64AsmPrinter> X(TheARM64Target);
|
||||
}
|
506
lib/Target/ARM64/ARM64BranchRelaxation.cpp
Normal file
@ -0,0 +1,506 @@
|
||||
//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
// This file implements a pass that relaxes conditional branches whose
// destinations are out of range, by inverting the condition and branching
// over an unconditional branch to the original destination.
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-branch-relax"
|
||||
#include "ARM64.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64MachineFunctionInfo.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/Format.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool>
|
||||
BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true),
|
||||
cl::desc("Relax out of range conditional branches"));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14),
|
||||
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19),
|
||||
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19),
|
||||
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
|
||||
|
||||
STATISTIC(NumSplit, "Number of basic blocks split");
|
||||
STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
|
||||
|
||||
namespace {
|
||||
class ARM64BranchRelaxation : public MachineFunctionPass {
|
||||
/// BasicBlockInfo - Information about the offset and size of a single
|
||||
/// basic block.
|
||||
struct BasicBlockInfo {
|
||||
/// Offset - Distance from the beginning of the function to the beginning
|
||||
/// of this basic block.
|
||||
///
|
||||
/// The offset is always aligned as required by the basic block.
|
||||
unsigned Offset;
|
||||
|
||||
/// Size - Size of the basic block in bytes. If the block contains
|
||||
/// inline assembly, this is a worst case estimate.
|
||||
///
|
||||
/// The size does not include any alignment padding whether from the
|
||||
/// beginning of the block, or from an aligned jump table at the end.
|
||||
unsigned Size;
|
||||
|
||||
BasicBlockInfo() : Offset(0), Size(0) {}
|
||||
|
||||
/// Compute the offset immediately following this block. If LogAlign is
|
||||
/// specified, return the offset the successor block will get if it has
|
||||
/// this alignment.
|
||||
unsigned postOffset(unsigned LogAlign = 0) const {
|
||||
unsigned PO = Offset + Size;
|
||||
unsigned Align = 1 << LogAlign;
|
||||
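// Round PO up to the next multiple of Align (Align is a power of two).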
return (PO + Align - 1) / Align * Align;
|
||||
}
|
||||
};
|
||||
|
||||
SmallVector<BasicBlockInfo, 16> BlockInfo;
|
||||
|
||||
MachineFunction *MF;
|
||||
const ARM64InstrInfo *TII;
|
||||
|
||||
bool relaxBranchInstructions();
|
||||
void scanFunction();
|
||||
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
|
||||
void adjustBlockOffsets(MachineBasicBlock *BB);
|
||||
bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
|
||||
bool fixupConditionalBranch(MachineInstr *MI);
|
||||
void computeBlockSize(MachineBasicBlock *MBB);
|
||||
unsigned getInstrOffset(MachineInstr *MI) const;
|
||||
void dumpBBs();
|
||||
void verify();
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
ARM64BranchRelaxation() : MachineFunctionPass(ID) {}
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &MF);
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "ARM64 branch relaxation pass";
|
||||
}
|
||||
};
|
||||
char ARM64BranchRelaxation::ID = 0;
|
||||
}
|
||||
|
||||
/// verify - check BBOffsets, BBSizes, alignment of islands
|
||||
void ARM64BranchRelaxation::verify() {
|
||||
#ifndef NDEBUG
|
||||
unsigned PrevNum = MF->begin()->getNumber();
|
||||
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E;
|
||||
++MBBI) {
|
||||
MachineBasicBlock *MBB = MBBI;
|
||||
unsigned Align = MBB->getAlignment();
|
||||
unsigned Num = MBB->getNumber();
|
||||
assert(BlockInfo[Num].Offset % (1u << Align) == 0);
|
||||
assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
|
||||
PrevNum = Num;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/// print block size and offset information - debugging
|
||||
void ARM64BranchRelaxation::dumpBBs() {
|
||||
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E;
|
||||
++MBBI) {
|
||||
const BasicBlockInfo &BBI = BlockInfo[MBBI->getNumber()];
|
||||
dbgs() << format("BB#%u\toffset=%08x\t", MBBI->getNumber(), BBI.Offset)
|
||||
<< format("size=%#x\n", BBI.Size);
|
||||
}
|
||||
}
|
||||
|
||||
/// BBHasFallthrough - Return true if the specified basic block can fallthrough
|
||||
/// into the block immediately after it.
|
||||
static bool BBHasFallthrough(MachineBasicBlock *MBB) {
|
||||
// Get the next machine basic block in the function.
|
||||
MachineFunction::iterator MBBI = MBB;
|
||||
// Can't fall off end of function.
|
||||
if (std::next(MBBI) == MBB->getParent()->end())
|
||||
return false;
|
||||
|
||||
MachineBasicBlock *NextBB = std::next(MBBI);
|
||||
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
|
||||
E = MBB->succ_end();
|
||||
I != E; ++I)
|
||||
if (*I == NextBB)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// scanFunction - Do the initial scan of the function, building up
|
||||
/// information about each block.
|
||||
void ARM64BranchRelaxation::scanFunction() {
|
||||
BlockInfo.clear();
|
||||
BlockInfo.resize(MF->getNumBlockIDs());
|
||||
|
||||
// First thing, compute the size of all basic blocks, and see if the function
|
||||
// has any inline assembly in it. If so, we have to be conservative about
|
||||
// alignment assumptions, as we don't know for sure the size of any
|
||||
// instructions in the inline assembly.
|
||||
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
|
||||
computeBlockSize(I);
|
||||
|
||||
// Compute block offsets and known bits.
|
||||
adjustBlockOffsets(MF->begin());
|
||||
}
|
||||
|
||||
/// computeBlockSize - Compute the size for MBB.
|
||||
/// This function updates BlockInfo directly.
|
||||
void ARM64BranchRelaxation::computeBlockSize(MachineBasicBlock *MBB) {
|
||||
unsigned Size = 0;
|
||||
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
|
||||
++I)
|
||||
Size += TII->GetInstSizeInBytes(I);
|
||||
BlockInfo[MBB->getNumber()].Size = Size;
|
||||
}
|
||||
|
||||
/// getInstrOffset - Return the current offset of the specified machine
|
||||
/// instruction from the start of the function. This offset changes as stuff is
|
||||
/// moved around inside the function.
|
||||
unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
|
||||
// The offset is composed of two things: the sum of the sizes of all MBB's
|
||||
// before this instruction's block, and the offset from the start of the block
|
||||
// it is in.
|
||||
unsigned Offset = BlockInfo[MBB->getNumber()].Offset;
|
||||
|
||||
// Sum instructions before MI in MBB.
|
||||
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
|
||||
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
|
||||
Offset += TII->GetInstSizeInBytes(I);
|
||||
}
|
||||
return Offset;
|
||||
}
|
||||
|
||||
void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock *Start) {
|
||||
unsigned PrevNum = Start->getNumber();
|
||||
MachineFunction::iterator MBBI = Start, E = MF->end();
|
||||
for (++MBBI; MBBI != E; ++MBBI) {
|
||||
MachineBasicBlock *MBB = MBBI;
|
||||
unsigned Num = MBB->getNumber();
|
||||
if (!Num) // block zero is never changed from offset zero.
|
||||
continue;
|
||||
// Get the offset and known bits at the end of the layout predecessor.
|
||||
// Include the alignment of the current block.
|
||||
unsigned LogAlign = MBBI->getAlignment();
|
||||
BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign);
|
||||
PrevNum = Num;
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the basic block containing MI into two blocks, which are joined by
|
||||
/// an unconditional branch. Update data structures and renumber blocks to
|
||||
/// account for this change and returns the newly created block.
|
||||
/// NOTE: Successor list of the original BB is out of date after this function,
|
||||
/// and must be updated by the caller! Other transforms that follow use this
/// utility function, so there is no point updating it here rather than waiting.
|
||||
MachineBasicBlock *
|
||||
ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
|
||||
MachineBasicBlock *OrigBB = MI->getParent();
|
||||
|
||||
// Create a new MBB for the code after the OrigBB.
|
||||
MachineBasicBlock *NewBB =
|
||||
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
|
||||
MachineFunction::iterator MBBI = OrigBB;
|
||||
++MBBI;
|
||||
MF->insert(MBBI, NewBB);
|
||||
|
||||
// Splice the instructions starting with MI over to NewBB.
|
||||
NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
|
||||
|
||||
// Add an unconditional branch from OrigBB to NewBB.
|
||||
// Note the new unconditional branch is not being recorded.
|
||||
// There doesn't seem to be meaningful DebugInfo available; this doesn't
|
||||
// correspond to anything in the source.
|
||||
BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB);
|
||||
|
||||
// Insert an entry into BlockInfo to align it properly with the block numbers.
|
||||
BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
|
||||
|
||||
// Figure out how large the OrigBB is. As the first half of the original
|
||||
// block, it cannot contain a tablejump. The size includes
|
||||
// the new jump we added. (It should be possible to do this without
|
||||
// recounting everything, but it's very confusing, and this is rarely
|
||||
// executed.)
|
||||
computeBlockSize(OrigBB);
|
||||
|
||||
// Figure out how large the NewMBB is. As the second half of the original
|
||||
// block, it may contain a tablejump.
|
||||
computeBlockSize(NewBB);
|
||||
|
||||
// All BBOffsets following these blocks must be modified.
|
||||
adjustBlockOffsets(OrigBB);
|
||||
|
||||
++NumSplit;
|
||||
|
||||
return NewBB;
|
||||
}
|
||||
|
||||
/// isBlockInRange - Returns true if the distance between the specified MI and
/// the specified BB fits in MI's displacement field.
|
||||
bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI,
|
||||
MachineBasicBlock *DestBB,
|
||||
unsigned Bits) {
|
||||
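// The displacement is a signed field of 'Bits' bits counted in 4-byte
// instruction words, so the largest reachable forward offset in bytes is
// ((1 << (Bits - 1)) - 1) << 2.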
unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2;
|
||||
unsigned BrOffset = getInstrOffset(MI);
|
||||
unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset;
|
||||
|
||||
DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
|
||||
<< " from BB#" << MI->getParent()->getNumber()
|
||||
<< " max delta=" << MaxOffs << " from " << getInstrOffset(MI)
|
||||
<< " to " << DestOffset << " offset "
|
||||
<< int(DestOffset - BrOffset) << "\t" << *MI);
|
||||
|
||||
// Branch before the Dest.
|
||||
if (BrOffset <= DestOffset)
|
||||
return (DestOffset - BrOffset <= MaxOffs);
|
||||
return (BrOffset - DestOffset <= MaxOffs);
|
||||
}
|
||||
|
||||
static bool isConditionalBranch(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
return false;
|
||||
case ARM64::TBZ:
|
||||
case ARM64::TBNZ:
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
case ARM64::Bcc:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
assert(0 && "unexpected opcode!");
|
||||
case ARM64::TBZ:
|
||||
case ARM64::TBNZ:
|
||||
return MI->getOperand(2).getMBB();
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
case ARM64::Bcc:
|
||||
return MI->getOperand(1).getMBB();
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getOppositeConditionOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
assert(0 && "unexpected opcode!");
|
||||
case ARM64::TBNZ: return ARM64::TBZ;
|
||||
case ARM64::TBZ: return ARM64::TBNZ;
|
||||
case ARM64::CBNZW: return ARM64::CBZW;
|
||||
case ARM64::CBNZX: return ARM64::CBZX;
|
||||
case ARM64::CBZW: return ARM64::CBNZW;
|
||||
case ARM64::CBZX: return ARM64::CBNZX;
|
||||
case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc.
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getBranchDisplacementBits(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
assert(0 && "unexpected opcode!");
|
||||
case ARM64::TBNZ:
|
||||
case ARM64::TBZ:
|
||||
return TBZDisplacementBits;
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZX:
|
||||
case ARM64::CBZX:
|
||||
return CBZDisplacementBits;
|
||||
case ARM64::Bcc:
|
||||
return BCCDisplacementBits;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void invertBccCondition(MachineInstr *MI) {
|
||||
assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!");
|
||||
ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm();
|
||||
CC = ARM64CC::getInvertedCondCode(CC);
|
||||
MI->getOperand(0).setImm((int64_t)CC);
|
||||
}
|
||||
|
||||
/// fixupConditionalBranch - Fix up a conditional branch whose destination is
|
||||
/// too far away to fit in its displacement field. It is converted to an inverse
|
||||
/// conditional branch + an unconditional branch to the destination.
|
||||
bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
|
||||
MachineBasicBlock *DestBB = getDestBlock(MI);
|
||||
|
||||
// Add an unconditional branch to the destination and invert the branch
|
||||
// condition to jump over it:
|
||||
// tbz L1
|
||||
// =>
|
||||
// tbnz L2
|
||||
// b L1
|
||||
// L2:
|
||||
|
||||
// If the branch is at the end of its MBB and that has a fall-through block,
|
||||
// direct the updated conditional branch to the fall-through block. Otherwise,
|
||||
// split the MBB before the next instruction.
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
MachineInstr *BMI = &MBB->back();
|
||||
bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
|
||||
|
||||
if (BMI != MI) {
|
||||
if (std::next(MachineBasicBlock::iterator(MI)) ==
|
||||
std::prev(MBB->getLastNonDebugInstr()) &&
|
||||
BMI->getOpcode() == ARM64::B) {
|
||||
// Last MI in the BB is an unconditional branch. Can we simply invert the
|
||||
// condition and swap destinations:
|
||||
// beq L1
|
||||
// b L2
|
||||
// =>
|
||||
// bne L2
|
||||
// b L1
|
||||
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
|
||||
if (isBlockInRange(MI, NewDest,
|
||||
getBranchDisplacementBits(MI->getOpcode()))) {
|
||||
DEBUG(dbgs() << " Invert condition and swap its destination with "
|
||||
<< *BMI);
|
||||
BMI->getOperand(0).setMBB(DestBB);
|
||||
unsigned OpNum =
|
||||
(MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
|
||||
? 2
|
||||
: 1;
|
||||
MI->getOperand(OpNum).setMBB(NewDest);
|
||||
MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
|
||||
if (MI->getOpcode() == ARM64::Bcc)
|
||||
invertBccCondition(MI);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (NeedSplit) {
|
||||
// Analyze the branch so we know how to update the successor lists.
|
||||
MachineBasicBlock *TBB, *FBB;
|
||||
SmallVector<MachineOperand, 2> Cond;
|
||||
TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
|
||||
|
||||
MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
|
||||
// No need for the branch to the next block. We're adding an unconditional
|
||||
// branch to the destination.
|
||||
int delta = TII->GetInstSizeInBytes(&MBB->back());
|
||||
BlockInfo[MBB->getNumber()].Size -= delta;
|
||||
MBB->back().eraseFromParent();
|
||||
// BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
|
||||
|
||||
// Update the successor lists according to the transformation to follow.
|
||||
// Do it here since if there's no split, no update is needed.
|
||||
MBB->replaceSuccessor(FBB, NewBB);
|
||||
NewBB->addSuccessor(FBB);
|
||||
}
|
||||
MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
|
||||
|
||||
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
|
||||
<< ", invert condition and change dest. to BB#"
|
||||
<< NextBB->getNumber() << "\n");
|
||||
|
||||
// Insert a new conditional branch and a new unconditional branch.
|
||||
MachineInstrBuilder MIB = BuildMI(
|
||||
MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
|
||||
.addOperand(MI->getOperand(0));
|
||||
if (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
|
||||
MIB.addOperand(MI->getOperand(1));
|
||||
if (MI->getOpcode() == ARM64::Bcc)
|
||||
invertBccCondition(MIB);
|
||||
MIB.addMBB(NextBB);
|
||||
BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
|
||||
BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB);
|
||||
BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
|
||||
|
||||
// Remove the old conditional branch. It may or may not still be in MBB.
|
||||
BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
|
||||
MI->eraseFromParent();
|
||||
|
||||
// Finally, keep the block offsets up to date.
|
||||
adjustBlockOffsets(MBB);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64BranchRelaxation::relaxBranchInstructions() {
|
||||
bool Changed = false;
|
||||
// Relaxing branches involves creating new basic blocks, so re-evaluate
// end() on each iteration for termination.
|
||||
for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
|
||||
MachineInstr *MI = I->getFirstTerminator();
|
||||
if (isConditionalBranch(MI->getOpcode()) &&
|
||||
!isBlockInRange(MI, getDestBlock(MI),
|
||||
getBranchDisplacementBits(MI->getOpcode()))) {
|
||||
fixupConditionalBranch(MI);
|
||||
++NumRelaxed;
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
|
||||
MF = &mf;
|
||||
|
||||
// If the pass is disabled, just bail early.
|
||||
if (!BranchRelaxation)
|
||||
return false;
|
||||
|
||||
DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n");
|
||||
|
||||
TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo();
|
||||
|
||||
// Renumber all of the machine basic blocks in the function, guaranteeing that
|
||||
// the numbers agree with the position of the block in the function.
|
||||
MF->RenumberBlocks();
|
||||
|
||||
// Do the initial scan of the function, building up information about the
|
||||
// sizes of each block.
|
||||
scanFunction();
|
||||
|
||||
DEBUG(dbgs() << " Basic blocks before relaxation\n");
|
||||
DEBUG(dumpBBs());
|
||||
|
||||
bool MadeChange = false;
|
||||
while (relaxBranchInstructions())
|
||||
MadeChange = true;
|
||||
|
||||
// After a while, this might be made debug-only, but it is not expensive.
|
||||
verify();
|
||||
|
||||
DEBUG(dbgs() << " Basic blocks after relaxation\n");
|
||||
DEBUG(dbgs() << '\n'; dumpBBs());
|
||||
|
||||
BlockInfo.clear();
|
||||
|
||||
return MadeChange;
|
||||
}
|
||||
|
||||
/// createARM64BranchRelaxation - Returns an instance of the ARM64 branch
/// relaxation pass.
|
||||
FunctionPass *llvm::createARM64BranchRelaxation() {
|
||||
return new ARM64BranchRelaxation();
|
||||
}
|
94
lib/Target/ARM64/ARM64CallingConv.h
Normal file
@ -0,0 +1,94 @@
|
||||
//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the custom routines for the ARM64 Calling Convention that
|
||||
// aren't done by tablegen.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64CALLINGCONV_H
|
||||
#define ARM64CALLINGCONV_H
|
||||
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "llvm/IR/CallingConv.h"
|
||||
#include "llvm/CodeGen/CallingConvLower.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via
|
||||
/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the
|
||||
/// argument is already promoted and LocVT is i1/i8/i16. We only promote the
|
||||
/// argument to i32 if we are sure this argument will be passed in a register.
|
||||
static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
|
||||
CCValAssign::LocInfo LocInfo,
|
||||
ISD::ArgFlagsTy ArgFlags,
|
||||
CCState &State,
|
||||
bool IsWebKitJS = false) {
|
||||
static const uint16_t RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2,
|
||||
ARM64::W3, ARM64::W4, ARM64::W5,
|
||||
ARM64::W6, ARM64::W7 };
|
||||
static const uint16_t RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2,
|
||||
ARM64::X3, ARM64::X4, ARM64::X5,
|
||||
ARM64::X6, ARM64::X7 };
|
||||
static const uint16_t WebKitRegList1[] = { ARM64::W0 };
|
||||
static const uint16_t WebKitRegList2[] = { ARM64::X0 };
|
||||
|
||||
const uint16_t *List1 = IsWebKitJS ? WebKitRegList1 : RegList1;
|
||||
const uint16_t *List2 = IsWebKitJS ? WebKitRegList2 : RegList2;
|
||||
|
||||
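// Try to allocate the next free 32-bit register from List1; the matching
// 64-bit register in List2 is shadowed (marked as used) at the same time.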
if (unsigned Reg = State.AllocateReg(List1, List2, 8)) {
|
||||
// Customized extra section for handling i1/i8/i16:
|
||||
// We need to promote the argument to i32 if it is not done already.
|
||||
if (ValVT != MVT::i32) {
|
||||
if (ArgFlags.isSExt())
|
||||
LocInfo = CCValAssign::SExt;
|
||||
else if (ArgFlags.isZExt())
|
||||
LocInfo = CCValAssign::ZExt;
|
||||
else
|
||||
LocInfo = CCValAssign::AExt;
|
||||
ValVT = MVT::i32;
|
||||
}
|
||||
// Set LocVT to i32 as well if passing via register.
|
||||
LocVT = MVT::i32;
|
||||
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16
|
||||
/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only
|
||||
/// uses the first register.
|
||||
static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
|
||||
CCValAssign::LocInfo LocInfo,
|
||||
ISD::ArgFlagsTy ArgFlags,
|
||||
CCState &State) {
|
||||
return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
|
||||
State, true);
|
||||
}
|
||||
|
||||
/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on
|
||||
/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument
|
||||
/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted,
|
||||
/// it will be truncated back to i1/i8/i16.
|
||||
static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT,
|
||||
CCValAssign::LocInfo LocInfo,
|
||||
ISD::ArgFlagsTy ArgFlags,
|
||||
CCState &State) {
|
||||
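// i1 and i8 occupy a single byte on the stack, i16 occupies two; the slot is
// aligned to its own size.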
unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2);
|
||||
unsigned Offset12 = State.AllocateStack(Space, Space);
|
||||
ValVT = LocVT;
|
||||
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo));
|
||||
return true;
|
||||
}
|
||||
|
||||
} // End llvm namespace
|
||||
|
||||
#endif
|
210
lib/Target/ARM64/ARM64CallingConvention.td
Normal file
@ -0,0 +1,210 @@
|
||||
//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This describes the calling conventions for ARM64 architecture.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// CCIfAlign - Match if the original alignment of the argument equals Align.
|
||||
class CCIfAlign<string Align, CCAction A> :
|
||||
CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ARM AAPCS64 Calling Convention
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def CC_ARM64_AAPCS : CallingConv<[
|
||||
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
|
||||
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
|
||||
|
||||
// An SRet is passed in X8, not X0 like a normal pointer parameter.
|
||||
CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
|
||||
|
||||
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
|
||||
// up to eight each of GPR and FPR.
|
||||
CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
|
||||
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
|
||||
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
|
||||
// An i128 is split into two i64s; we can't fit half of it into register X7.
|
||||
CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
|
||||
[X0, X1, X3, X5]>>>,
|
||||
|
||||
// i128 is split to two i64s, and its stack alignment is 16 bytes.
|
||||
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
|
||||
|
||||
CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
|
||||
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
|
||||
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
|
||||
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
|
||||
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
|
||||
// If more than will fit in registers, pass them on the stack instead.
|
||||
CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>,
|
||||
CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
|
||||
CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
|
||||
CCAssignToStack<8, 8>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
|
||||
]>;
|
||||
|
||||
def RetCC_ARM64_AAPCS : CallingConv<[
|
||||
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
|
||||
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
|
||||
|
||||
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
|
||||
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
|
||||
CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
|
||||
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
|
||||
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
|
||||
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
|
||||
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
|
||||
]>;
|
||||
|
||||
|
||||
// Darwin uses a calling convention which differs in only two ways
|
||||
// from the standard one at this level:
|
||||
// + i128s (i.e. split i64s) don't need even registers.
|
||||
// + Stack slots are sized as needed rather than being at least 64-bit.
|
||||
def CC_ARM64_DarwinPCS : CallingConv<[
|
||||
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
|
||||
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
|
||||
|
||||
// An SRet is passed in X8, not X0 like a normal pointer parameter.
|
||||
CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
|
||||
|
||||
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
|
||||
// up to eight each of GPR and FPR.
|
||||
CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
|
||||
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
|
||||
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
|
||||
// An i128 is split into two i64s; we can't fit half of it into register X7.
|
||||
CCIfType<[i64],
|
||||
CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
|
||||
[W0, W1, W2, W3, W4, W5, W6]>>>,
|
||||
// i128 is split to two i64s, and its stack alignment is 16 bytes.
|
||||
CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
|
||||
|
||||
CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
|
||||
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
|
||||
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
|
||||
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
|
||||
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
|
||||
// If more than will fit in registers, pass them on the stack instead.
|
||||
CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>,
|
||||
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
|
||||
CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
|
||||
CCAssignToStack<8, 8>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
|
||||
]>;
|
||||
|
||||
def CC_ARM64_DarwinPCS_VarArg : CallingConv<[
|
||||
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
|
||||
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
|
||||
|
||||
// Handle all scalar types as either i64 or f64.
|
||||
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
|
||||
CCIfType<[f32], CCPromoteToType<f64>>,
|
||||
|
||||
// Everything is on the stack.
|
||||
// i128 is split to two i64s, and its stack alignment is 16 bytes.
|
||||
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
|
||||
CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
|
||||
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
|
||||
]>;
|
||||
|
||||
// The WebKit_JS calling convention only passes the first argument (the callee)
// in a register; the remaining arguments go on the stack. We allow 32-bit stack
// slots so that WebKit can write partial values into the stack and define the
// other 32-bit quantity as undef.
|
||||
def CC_ARM64_WebKit_JS : CallingConv<[
|
||||
// Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
|
||||
CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>,
|
||||
CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
|
||||
CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
|
||||
|
||||
// Pass the remaining arguments on the stack instead.
|
||||
CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>,
|
||||
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
|
||||
CCIfType<[i64, f64], CCAssignToStack<8, 8>>
|
||||
]>;
|
||||
|
||||
def RetCC_ARM64_WebKit_JS : CallingConv<[
|
||||
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
|
||||
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
|
||||
CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
|
||||
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
|
||||
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
|
||||
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
|
||||
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
|
||||
]>;
|
||||
|
||||
// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
|
||||
// presumably a callee to someone. External functions may not do so, but this
|
||||
// is currently safe since BL has LR as an implicit-def and what happens after a
|
||||
// tail call doesn't matter.
|
||||
//
|
||||
// It would be better to model its preservation semantics properly (create a
|
||||
// vreg on entry, use it in RET & tail call generation; make that vreg def if we
|
||||
// end up saving LR as part of a call frame). Watch this space...
|
||||
def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
|
||||
X23, X24, X25, X26, X27, X28,
|
||||
D8, D9, D10, D11,
|
||||
D12, D13, D14, D15)>;
|
||||
|
||||
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
|
||||
// 'this' and the pointer return value are both passed in X0 in these cases,
|
||||
// this can be partially modelled by treating X0 as a callee-saved register;
|
||||
// only the resulting RegMask is used; the SaveList is ignored
|
||||
//
|
||||
// (For generic ARM 64-bit ABI code, clang will not generate constructors or
|
||||
// destructors with 'this' returns, so this RegMask will not be used in that
|
||||
// case)
|
||||
def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>;
|
||||
|
||||
// The function used by Darwin to obtain the address of a thread-local variable
|
||||
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
|
||||
// fast path for calculation, but other registers except X0 (argument/return)
|
||||
// and LR (it is a call, after all) are preserved.
|
||||
def CSR_ARM64_TLS_Darwin
|
||||
: CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
|
||||
FP,
|
||||
(sequence "Q%u", 0, 31))>;
|
||||
|
||||
// The ELF stub used for TLS-descriptor access saves every feasible
|
||||
// register. Only X0 and LR are clobbered.
|
||||
def CSR_ARM64_TLS_ELF
|
||||
: CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
|
||||
(sequence "Q%u", 0, 31))>;
|
||||
|
||||
def CSR_ARM64_AllRegs
|
||||
: CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
|
||||
(sequence "X%u", 0, 28), FP, LR, SP,
|
||||
(sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
|
||||
(sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
|
||||
(sequence "Q%u", 0, 31))>;
|
||||
|
148
lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp
Normal file
@ -0,0 +1,148 @@
|
||||
//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Local-dynamic access to thread-local variables proceeds in three stages.
|
||||
//
|
||||
// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
|
||||
// in much the same way as a general-dynamic TLS-descriptor access against
|
||||
// the special symbol _TLS_MODULE_BASE_.
|
||||
// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
|
||||
// instructions with "dtprel" modifiers.
|
||||
// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
|
||||
// true address.
|
||||
//
|
||||
// This is only better than general-dynamic access to the variable if two or
|
||||
// more of the first stage TLS-descriptor calculations can be combined. This
|
||||
// pass looks through a function and performs such combinations.
|
||||
//
|
||||
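//
// Illustrative sketch only (exact registers and relocations are chosen by the
// code generator): before this pass, each access starts with its own
// TLS-descriptor sequence against _TLS_MODULE_BASE_ ending in a BLR that
// leaves the module's TLS base in X0. After this pass, the first access copies
// X0 into a virtual register and later accesses copy it back instead of
// repeating the descriptor call.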
//===----------------------------------------------------------------------===//
|
||||
#include "ARM64.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64MachineFunctionInfo.h"
|
||||
#include "ARM64TargetMachine.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
struct LDTLSCleanup : public MachineFunctionPass {
|
||||
static char ID;
|
||||
LDTLSCleanup() : MachineFunctionPass(ID) {}
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &MF) {
|
||||
ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
|
||||
if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
|
||||
// No point folding accesses if there aren't at least two.
|
||||
return false;
|
||||
}
|
||||
|
||||
MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
|
||||
return VisitNode(DT->getRootNode(), 0);
|
||||
}
|
||||
|
||||
// Visit the dominator subtree rooted at Node in pre-order.
|
||||
// If TLSBaseAddrReg is non-null, then use that to replace any
|
||||
// TLS_base_addr instructions. Otherwise, create the register
|
||||
// when the first such instruction is seen, and then use it
|
||||
// as we encounter more instructions.
|
||||
bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
|
||||
MachineBasicBlock *BB = Node->getBlock();
|
||||
bool Changed = false;
|
||||
|
||||
// Traverse the current block.
|
||||
for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
|
||||
++I) {
|
||||
switch (I->getOpcode()) {
|
||||
case ARM64::TLSDESC_BLR:
|
||||
// Make sure it's a local dynamic access.
|
||||
if (!I->getOperand(1).isSymbol() ||
|
||||
strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
|
||||
break;
|
||||
|
||||
if (TLSBaseAddrReg)
|
||||
I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
|
||||
else
|
||||
I = setRegister(I, &TLSBaseAddrReg);
|
||||
Changed = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Visit the children of this block in the dominator tree.
|
||||
for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
|
||||
I != E; ++I) {
|
||||
Changed |= VisitNode(*I, TLSBaseAddrReg);
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Replace the TLS_base_addr instruction I with a copy from
|
||||
// TLSBaseAddrReg, returning the new instruction.
|
||||
MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
|
||||
unsigned TLSBaseAddrReg) {
|
||||
MachineFunction *MF = I->getParent()->getParent();
|
||||
const ARM64TargetMachine *TM =
|
||||
static_cast<const ARM64TargetMachine *>(&MF->getTarget());
|
||||
const ARM64InstrInfo *TII = TM->getInstrInfo();
|
||||
|
||||
// Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
|
||||
// code sequence assumes the address will be.
|
||||
MachineInstr *Copy =
|
||||
BuildMI(*I->getParent(), I, I->getDebugLoc(),
|
||||
TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg);
|
||||
|
||||
// Erase the TLS_base_addr instruction.
|
||||
I->eraseFromParent();
|
||||
|
||||
return Copy;
|
||||
}
|
||||
|
||||
// Create a virtual register in *TLSBaseAddrReg, and populate it by
|
||||
// inserting a copy instruction after I. Returns the new instruction.
|
||||
MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
|
||||
MachineFunction *MF = I->getParent()->getParent();
|
||||
const ARM64TargetMachine *TM =
|
||||
static_cast<const ARM64TargetMachine *>(&MF->getTarget());
|
||||
const ARM64InstrInfo *TII = TM->getInstrInfo();
|
||||
|
||||
// Create a virtual register for the TLS base address.
|
||||
MachineRegisterInfo &RegInfo = MF->getRegInfo();
|
||||
*TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
|
||||
|
||||
// Insert a copy from X0 to TLSBaseAddrReg for later.
|
||||
MachineInstr *Next = I->getNextNode();
|
||||
MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
|
||||
TII->get(TargetOpcode::COPY),
|
||||
*TLSBaseAddrReg).addReg(ARM64::X0);
|
||||
|
||||
return Copy;
|
||||
}
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "Local Dynamic TLS Access Clean-up";
|
||||
}
|
||||
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<MachineDominatorTree>();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
char LDTLSCleanup::ID = 0;
|
||||
FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() {
|
||||
return new LDTLSCleanup();
|
||||
}
|
1122
lib/Target/ARM64/ARM64CollectLOH.cpp
Normal file
File diff suppressed because it is too large
918
lib/Target/ARM64/ARM64ConditionalCompares.cpp
Normal file
@ -0,0 +1,918 @@
|
||||
//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64ConditionalCompares pass which reduces
|
||||
// branching and code size by using the conditional compare instructions CCMP,
|
||||
// CCMN, and FCMP.
|
||||
//
|
||||
// The CFG transformations for forming conditional compares are very similar to
|
||||
// if-conversion, and this pass should run immediately before the early
|
||||
// if-conversion pass.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-ccmp"
|
||||
#include "ARM64.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/ADT/DepthFirstIterator.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SparseSet.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineLoopInfo.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/MachineTraceMetrics.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/Target/TargetSubtargetInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Absolute maximum number of instructions allowed per speculated block.
|
||||
// This bypasses all other heuristics, so it should be set fairly high.
|
||||
static cl::opt<unsigned> BlockInstrLimit(
|
||||
"arm64-ccmp-limit", cl::init(30), cl::Hidden,
|
||||
cl::desc("Maximum number of instructions per speculated block."));
|
||||
|
||||
// Stress testing mode - disable heuristics.
|
||||
static cl::opt<bool> Stress("arm64-stress-ccmp", cl::Hidden,
|
||||
cl::desc("Turn all knobs to 11"));
|
||||
|
||||
STATISTIC(NumConsidered, "Number of ccmps considered");
|
||||
STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
|
||||
STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
|
||||
STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
|
||||
STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
|
||||
STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
|
||||
STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
|
||||
STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
|
||||
STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
|
||||
STATISTIC(NumMultCPSRUses, "Number of ccmps rejected (CPSR used)");
|
||||
STATISTIC(NumUnknCPSRDefs, "Number of ccmps rejected (CPSR def unknown)");
|
||||
|
||||
STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
|
||||
|
||||
STATISTIC(NumConverted, "Number of ccmp instructions created");
|
||||
STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SSACCmpConv
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
|
||||
// after determining if it is possible. The class contains no heuristics;
|
||||
// external code should be used to determine when ccmp-conversion is a good
|
||||
// idea.
|
||||
//
|
||||
// CCmp-formation works on a CFG representing chained conditions, typically
|
||||
// from C's short-circuit || and && operators:
|
||||
//
|
||||
// From: Head To: Head
|
||||
// / | CmpBB
|
||||
// / | / |
|
||||
// | CmpBB / |
|
||||
// | / | Tail |
|
||||
// | / | | |
|
||||
// Tail | | |
|
||||
// | | | |
|
||||
// ... ... ... ...
|
||||
//
|
||||
// The Head block is terminated by a br.cond instruction, and the CmpBB block
|
||||
// contains compare + br.cond. Tail must be a successor of both.
|
||||
//
|
||||
// The cmp-conversion turns the compare instruction in CmpBB into a conditional
|
||||
// compare, and merges CmpBB into Head, speculatively executing its
|
||||
// instructions. The ARM64 conditional compare instructions have an immediate
|
||||
// operand that specifies the NZCV flag values when the condition is false and
|
||||
// the compare isn't executed. This makes it possible to chain compares with
|
||||
// different condition codes.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// if (a == 5 || b == 17)
|
||||
// foo();
|
||||
//
|
||||
// Head:
|
||||
// cmp w0, #5
|
||||
// b.eq Tail
|
||||
// CmpBB:
|
||||
// cmp w1, #17
|
||||
// b.eq Tail
|
||||
// ...
|
||||
// Tail:
|
||||
// bl _foo
|
||||
//
|
||||
// Becomes:
|
||||
//
|
||||
// Head:
|
||||
// cmp w0, #5
|
||||
// ccmp w1, #17, 4, ne ; 4 = nZcv
|
||||
// b.eq Tail
|
||||
// ...
|
||||
// Tail:
|
||||
// bl _foo
|
||||
//
|
||||
// The ccmp condition code is the one that would cause the Head terminator to
|
||||
// branch to CmpBB.
|
||||
//
|
||||
// FIXME: It should also be possible to speculate a block on the critical edge
|
||||
// between Head and Tail, just like if-converting a diamond.
|
||||
//
|
||||
// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
|
||||
|
||||
namespace {
|
||||
class SSACCmpConv {
|
||||
MachineFunction *MF;
|
||||
const TargetInstrInfo *TII;
|
||||
const TargetRegisterInfo *TRI;
|
||||
MachineRegisterInfo *MRI;
|
||||
|
||||
public:
|
||||
/// The first block containing a conditional branch, dominating everything
|
||||
/// else.
|
||||
MachineBasicBlock *Head;
|
||||
|
||||
/// The block containing cmp+br.cond with a successor shared with Head.
|
||||
MachineBasicBlock *CmpBB;
|
||||
|
||||
/// The common successor for Head and CmpBB.
|
||||
MachineBasicBlock *Tail;
|
||||
|
||||
/// The compare instruction in CmpBB that can be converted to a ccmp.
|
||||
MachineInstr *CmpMI;
|
||||
|
||||
private:
|
||||
/// The branch condition in Head as determined by AnalyzeBranch.
|
||||
SmallVector<MachineOperand, 4> HeadCond;
|
||||
|
||||
/// The condition code that makes Head branch to CmpBB.
|
||||
ARM64CC::CondCode HeadCmpBBCC;
|
||||
|
||||
/// The branch condition in CmpBB.
|
||||
SmallVector<MachineOperand, 4> CmpBBCond;
|
||||
|
||||
/// The condition code that makes CmpBB branch to Tail.
|
||||
ARM64CC::CondCode CmpBBTailCC;
|
||||
|
||||
/// Check if the Tail PHIs are trivially convertible.
|
||||
bool trivialTailPHIs();
|
||||
|
||||
/// Remove CmpBB from the Tail PHIs.
|
||||
void updateTailPHIs();
|
||||
|
||||
/// Check if an operand defining DstReg is dead.
|
||||
bool isDeadDef(unsigned DstReg);
|
||||
|
||||
/// Find the compare instruction in MBB that controls the conditional branch.
|
||||
/// Return NULL if a convertible instruction can't be found.
|
||||
MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
|
||||
|
||||
/// Return true if all non-terminator instructions in MBB can be safely
|
||||
/// speculated.
|
||||
bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
|
||||
|
||||
public:
|
||||
/// runOnMachineFunction - Initialize per-function data structures.
|
||||
void runOnMachineFunction(MachineFunction &MF) {
|
||||
this->MF = &MF;
|
||||
TII = MF.getTarget().getInstrInfo();
|
||||
TRI = MF.getTarget().getRegisterInfo();
|
||||
MRI = &MF.getRegInfo();
|
||||
}
|
||||
|
||||
/// If the sub-CFG headed by MBB can be cmp-converted, initialize the
|
||||
/// internal state, and return true.
|
||||
bool canConvert(MachineBasicBlock *MBB);
|
||||
|
||||
/// Cmp-convert the last block passed to canConvert(), assuming
|
||||
/// it is possible. Add any erased blocks to RemovedBlocks.
|
||||
void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
|
||||
|
||||
/// Return the expected code size delta if the conversion into a
|
||||
/// conditional compare is performed.
|
||||
int expectedCodeSizeDelta() const;
|
||||
};
|
||||
} // end anonymous namespace
|
||||
|
||||
// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
|
||||
// This means that no if-conversion is required when merging CmpBB into Head.
|
||||
bool SSACCmpConv::trivialTailPHIs() {
|
||||
for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
|
||||
I != E && I->isPHI(); ++I) {
|
||||
unsigned HeadReg = 0, CmpBBReg = 0;
|
||||
// PHI operands come in (VReg, MBB) pairs.
|
||||
for (unsigned oi = 1, oe = I->getNumOperands(); oi != oe; oi += 2) {
|
||||
MachineBasicBlock *MBB = I->getOperand(oi + 1).getMBB();
|
||||
unsigned Reg = I->getOperand(oi).getReg();
|
||||
if (MBB == Head) {
|
||||
assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
|
||||
HeadReg = Reg;
|
||||
}
|
||||
if (MBB == CmpBB) {
|
||||
assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
|
||||
CmpBBReg = Reg;
|
||||
}
|
||||
}
|
||||
if (HeadReg != CmpBBReg)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
|
||||
// removing the CmpBB operands. The Head operands will be identical.
|
||||
void SSACCmpConv::updateTailPHIs() {
|
||||
for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
|
||||
I != E && I->isPHI(); ++I) {
|
||||
// I is a PHI. It can have multiple entries for CmpBB.
|
||||
for (unsigned oi = I->getNumOperands(); oi > 2; oi -= 2) {
|
||||
// PHI operands are (Reg, MBB) at (oi-2, oi-1).
|
||||
if (I->getOperand(oi - 1).getMBB() == CmpBB) {
|
||||
I->RemoveOperand(oi - 1);
|
||||
I->RemoveOperand(oi - 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are
|
||||
// still writing virtual registers without any uses.
|
||||
bool SSACCmpConv::isDeadDef(unsigned DstReg) {
|
||||
// Writes to the zero register are dead.
|
||||
if (DstReg == ARM64::WZR || DstReg == ARM64::XZR)
|
||||
return true;
|
||||
if (!TargetRegisterInfo::isVirtualRegister(DstReg))
|
||||
return false;
|
||||
// A virtual register def without any uses will be marked dead later, and
|
||||
// eventually replaced by the zero register.
|
||||
return MRI->use_nodbg_empty(DstReg);
|
||||
}
|
||||
|
||||
// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
|
||||
// corresponding to TBB.
|
||||
// Return true if the condition code could be determined and CC was set.
|
||||
bool parseCond(ArrayRef<MachineOperand> Cond, ARM64CC::CondCode &CC) {
|
||||
// A normal br.cond simply has the condition code.
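// AnalyzeBranch uses -1 as a sentinel for cbz/cbnz/tbz-style branches; the
// branch opcode then follows in Cond[1] (and, for the cbz/cbnz forms
// handled below, the tested register in Cond[2]).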
|
||||
if (Cond[0].getImm() != -1) {
|
||||
assert(Cond.size() == 1 && "Unknown Cond array format");
|
||||
CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
|
||||
return true;
|
||||
}
|
||||
// For tbz and cbz instructions, the opcode is next.
|
||||
switch (Cond[1].getImm()) {
|
||||
default:
|
||||
// This includes tbz / tbnz branches which can't be converted to
|
||||
// ccmp + br.cond.
|
||||
return false;
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBZX:
|
||||
assert(Cond.size() == 3 && "Unknown Cond array format");
|
||||
CC = ARM64CC::EQ;
|
||||
return true;
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBNZX:
|
||||
assert(Cond.size() == 3 && "Unknown Cond array format");
|
||||
CC = ARM64CC::NE;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
|
||||
MachineBasicBlock::iterator I = MBB->getFirstTerminator();
|
||||
if (I == MBB->end())
|
||||
return 0;
|
||||
// The terminator must be controlled by the flags.
|
||||
if (!I->readsRegister(ARM64::CPSR)) {
|
||||
switch (I->getOpcode()) {
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBNZX:
|
||||
// These can be converted into a ccmp against #0.
|
||||
return I;
|
||||
}
|
||||
++NumCmpTermRejs;
|
||||
DEBUG(dbgs() << "Flags not used by terminator: " << *I);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Now find the instruction controlling the terminator.
|
||||
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
|
||||
--I;
|
||||
assert(!I->isTerminator() && "Spurious terminator");
|
||||
switch (I->getOpcode()) {
|
||||
// cmp is an alias for subs with a dead destination register.
|
||||
case ARM64::SUBSWri:
|
||||
case ARM64::SUBSXri:
|
||||
// cmn is an alias for adds with a dead destination register.
|
||||
case ARM64::ADDSWri:
|
||||
case ARM64::ADDSXri:
|
||||
// Check that the immediate operand is within range, ccmp wants a uimm5.
|
||||
// Rd = SUBSri Rn, imm, shift
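// A shifted immediate (non-zero operand 3) is also rejected because ccmp
// has no shifted-immediate form.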
|
||||
if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
|
||||
DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
|
||||
++NumImmRangeRejs;
|
||||
return 0;
|
||||
}
|
||||
// Fall through.
|
||||
case ARM64::SUBSWrr:
|
||||
case ARM64::SUBSXrr:
|
||||
case ARM64::ADDSWrr:
|
||||
case ARM64::ADDSXrr:
|
||||
if (isDeadDef(I->getOperand(0).getReg()))
|
||||
return I;
|
||||
DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
|
||||
++NumLiveDstRejs;
|
||||
return 0;
|
||||
case ARM64::FCMPSrr:
|
||||
case ARM64::FCMPDrr:
|
||||
case ARM64::FCMPESrr:
|
||||
case ARM64::FCMPEDrr:
|
||||
return I;
|
||||
}
|
||||
|
||||
// Check for flag reads and clobbers.
|
||||
MIOperands::PhysRegInfo PRI =
|
||||
MIOperands(I).analyzePhysReg(ARM64::CPSR, TRI);
|
||||
|
||||
if (PRI.Reads) {
|
||||
// The ccmp doesn't produce exactly the same flags as the original
|
||||
// compare, so reject the transform if there are uses of the flags
|
||||
// besides the terminators.
|
||||
DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
|
||||
++NumMultCPSRUses;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (PRI.Clobbers) {
|
||||
DEBUG(dbgs() << "Not convertible compare: " << *I);
|
||||
++NumUnknCPSRDefs;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Determine if all the instructions in MBB can safely
|
||||
/// be speculated. The terminators are not considered.
|
||||
///
|
||||
/// Only CmpMI is allowed to clobber the flags.
|
||||
///
|
||||
bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
|
||||
const MachineInstr *CmpMI) {
|
||||
// Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
|
||||
// get right.
|
||||
if (!MBB->livein_empty()) {
|
||||
DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned InstrCount = 0;
|
||||
|
||||
// Check all instructions, except the terminators. It is assumed that
|
||||
// terminators never have side effects or define any used register values.
|
||||
for (MachineBasicBlock::iterator I = MBB->begin(),
|
||||
E = MBB->getFirstTerminator();
|
||||
I != E; ++I) {
|
||||
if (I->isDebugValue())
|
||||
continue;
|
||||
|
||||
if (++InstrCount > BlockInstrLimit && !Stress) {
|
||||
DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
|
||||
<< BlockInstrLimit << " instructions.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// There shouldn't normally be any phis in a single-predecessor block.
|
||||
if (I->isPHI()) {
|
||||
DEBUG(dbgs() << "Can't hoist: " << *I);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Don't speculate loads. Note that it may be possible and desirable to
|
||||
// speculate GOT or constant pool loads that are guaranteed not to trap,
|
||||
// but we don't support that for now.
|
||||
if (I->mayLoad()) {
|
||||
DEBUG(dbgs() << "Won't speculate load: " << *I);
|
||||
return false;
|
||||
}
|
||||
|
||||
// We never speculate stores, so an AA pointer isn't necessary.
|
||||
bool DontMoveAcrossStore = true;
|
||||
if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
|
||||
DEBUG(dbgs() << "Can't speculate: " << *I);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Only CmpMI is allowed to clobber the flags.
|
||||
if (&*I != CmpMI && I->modifiesRegister(ARM64::CPSR, TRI)) {
|
||||
DEBUG(dbgs() << "Clobbers flags: " << *I);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
|
||||
/// candidate for cmp-conversion. Fill out the internal state.
|
||||
///
|
||||
bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
|
||||
Head = MBB;
|
||||
Tail = CmpBB = 0;
|
||||
|
||||
if (Head->succ_size() != 2)
|
||||
return false;
|
||||
MachineBasicBlock *Succ0 = Head->succ_begin()[0];
|
||||
MachineBasicBlock *Succ1 = Head->succ_begin()[1];
|
||||
|
||||
// CmpBB can only have a single predecessor. Tail is allowed many.
|
||||
if (Succ0->pred_size() != 1)
|
||||
std::swap(Succ0, Succ1);
|
||||
|
||||
// Succ0 is our candidate for CmpBB.
|
||||
if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
|
||||
return false;
|
||||
|
||||
CmpBB = Succ0;
|
||||
Tail = Succ1;
|
||||
|
||||
if (!CmpBB->isSuccessor(Tail))
|
||||
return false;
|
||||
|
||||
// The CFG topology checks out.
|
||||
DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
|
||||
<< CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
|
||||
++NumConsidered;
|
||||
|
||||
// Tail is allowed to have many predecessors, but we can't handle PHIs yet.
|
||||
//
|
||||
// FIXME: Real PHIs could be if-converted as long as the CmpBB values are
|
||||
// defined before the CmpBB cmp clobbers the flags. Alternatively, it should
|
||||
// always be safe to sink the ccmp down to immediately before the CmpBB
|
||||
// terminators.
|
||||
if (!trivialTailPHIs()) {
|
||||
DEBUG(dbgs() << "Can't handle phis in Tail.\n");
|
||||
++NumPhiRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!Tail->livein_empty()) {
|
||||
DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
|
||||
++NumPhysRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
// CmpBB should never have PHIs since Head is its only predecessor.
|
||||
// FIXME: Clean them up if it happens.
|
||||
if (!CmpBB->empty() && CmpBB->front().isPHI()) {
|
||||
DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
|
||||
++NumPhi2Rejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!CmpBB->livein_empty()) {
|
||||
DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
|
||||
++NumPhysRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
// The branch we're looking to eliminate must be analyzable.
|
||||
HeadCond.clear();
|
||||
MachineBasicBlock *TBB = 0, *FBB = 0;
|
||||
if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
|
||||
DEBUG(dbgs() << "Head branch not analyzable.\n");
|
||||
++NumHeadBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
// This is weird, probably some sort of degenerate CFG, or an edge to a
|
||||
// landing pad.
|
||||
if (!TBB || HeadCond.empty()) {
|
||||
DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
|
||||
++NumHeadBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!parseCond(HeadCond, HeadCmpBBCC)) {
|
||||
DEBUG(dbgs() << "Unsupported branch type on Head\n");
|
||||
++NumHeadBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure the branch direction is right.
|
||||
if (TBB != CmpBB) {
|
||||
assert(TBB == Tail && "Unexpected TBB");
|
||||
HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC);
|
||||
}
|
||||
|
||||
CmpBBCond.clear();
|
||||
TBB = FBB = 0;
|
||||
if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
|
||||
DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
|
||||
++NumCmpBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!TBB || CmpBBCond.empty()) {
|
||||
DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
|
||||
++NumCmpBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!parseCond(CmpBBCond, CmpBBTailCC)) {
|
||||
DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
|
||||
++NumCmpBranchRejs;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (TBB != Tail)
|
||||
CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC);
|
||||
|
||||
DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC)
|
||||
<< ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC)
|
||||
<< '\n');
|
||||
|
||||
CmpMI = findConvertibleCompare(CmpBB);
|
||||
if (!CmpMI)
|
||||
return false;
|
||||
|
||||
if (!canSpeculateInstrs(CmpBB, CmpMI)) {
|
||||
++NumSpeculateRejs;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
|
||||
DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
|
||||
<< Head->getNumber() << ":\n" << *CmpBB);
|
||||
|
||||
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
|
||||
// Update the CFG first.
|
||||
updateTailPHIs();
|
||||
Head->removeSuccessor(CmpBB);
|
||||
CmpBB->removeSuccessor(Tail);
|
||||
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
|
||||
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
|
||||
TII->RemoveBranch(*Head);
|
||||
|
||||
// If the Head terminator was one of the cbz / tbz branches with built-in
|
||||
// compare, we need to insert an explicit compare instruction in its place.
|
||||
if (HeadCond[0].getImm() == -1) {
|
||||
++NumCompBranches;
|
||||
unsigned Opc = 0;
|
||||
switch (HeadCond[1].getImm()) {
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
Opc = ARM64::SUBSWri;
|
||||
break;
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
Opc = ARM64::SUBSXri;
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("Cannot convert Head branch");
|
||||
}
|
||||
const MCInstrDesc &MCID = TII->get(Opc);
|
||||
// Create a dummy virtual register for the SUBS def.
|
||||
unsigned DestReg =
|
||||
MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
|
||||
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
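// SUBS Rn, #0 sets Z exactly when Rn is zero, so the EQ/NE condition
// derived from the cbz / cbnz by parseCond() keeps its meaning.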
|
||||
BuildMI(*Head, Head->end(), TermDL, MCID)
|
||||
.addReg(DestReg, RegState::Define | RegState::Dead)
|
||||
.addOperand(HeadCond[2])
|
||||
.addImm(0)
|
||||
.addImm(0);
|
||||
// SUBS uses the GPR*sp register classes.
|
||||
MRI->constrainRegClass(HeadCond[2].getReg(),
|
||||
TII->getRegClass(MCID, 1, TRI, *MF));
|
||||
}
|
||||
|
||||
Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
|
||||
|
||||
// Now replace CmpMI with a ccmp instruction that also considers the incoming
|
||||
// flags.
|
||||
unsigned Opc = 0;
|
||||
unsigned FirstOp = 1; // First CmpMI operand to copy.
|
||||
bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
|
||||
switch (CmpMI->getOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("Unknown compare opcode");
|
||||
case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break;
|
||||
case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break;
|
||||
case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break;
|
||||
case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break;
|
||||
case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break;
|
||||
case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break;
|
||||
case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break;
|
||||
case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break;
|
||||
case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break;
|
||||
case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break;
|
||||
case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break;
|
||||
case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break;
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
Opc = ARM64::CCMPWi;
|
||||
FirstOp = 0;
|
||||
isZBranch = true;
|
||||
break;
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
Opc = ARM64::CCMPXi;
|
||||
FirstOp = 0;
|
||||
isZBranch = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// The ccmp instruction should set the flags according to the comparison when
|
||||
// Head would have branched to CmpBB.
|
||||
// The NZCV immediate operand should provide flags for the case where Head
|
||||
// would have branched to Tail. These flags should cause the new Head
|
||||
// terminator to branch to Tail.
|
||||
unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
|
||||
const MCInstrDesc &MCID = TII->get(Opc);
|
||||
MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
|
||||
TII->getRegClass(MCID, 0, TRI, *MF));
|
||||
if (CmpMI->getOperand(FirstOp + 1).isReg())
|
||||
MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
|
||||
TII->getRegClass(MCID, 1, TRI, *MF));
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
|
||||
.addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
|
||||
if (isZBranch)
|
||||
MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
|
||||
else
|
||||
MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
|
||||
MIB.addImm(NZCV).addImm(HeadCmpBBCC);
|
||||
|
||||
// If CmpMI was a terminator, we need a new conditional branch to replace it.
|
||||
// This now becomes a Head terminator.
|
||||
if (isZBranch) {
|
||||
bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW ||
|
||||
CmpMI->getOpcode() == ARM64::CBNZX;
|
||||
BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc))
|
||||
.addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ)
|
||||
.addOperand(CmpMI->getOperand(1)); // Branch target.
|
||||
}
|
||||
CmpMI->eraseFromParent();
|
||||
Head->updateTerminator();
|
||||
|
||||
RemovedBlocks.push_back(CmpBB);
|
||||
CmpBB->eraseFromParent();
|
||||
DEBUG(dbgs() << "Result:\n" << *Head);
|
||||
++NumConverted;
|
||||
}
|
||||
|
||||
int SSACCmpConv::expectedCodeSizeDelta() const {
|
||||
int delta = 0;
|
||||
// If the Head terminator was one of the cbz / tbz branches with built-in
|
||||
// compare, we need to insert an explicit compare instruction in its place
|
||||
// plus a branch instruction.
|
||||
if (HeadCond[0].getImm() == -1) {
|
||||
switch (HeadCond[1].getImm()) {
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
// Therefore delta += 1
|
||||
delta = 1;
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("Cannot convert Head branch");
|
||||
}
|
||||
}
|
||||
// If the Cmp terminator was one of the cbz / tbz branches with
|
||||
// built-in compare, it will be turned into a compare instruction
|
||||
// in Head, but we do not save any instruction.
|
||||
// Otherwise, we save the branch instruction.
|
||||
switch (CmpMI->getOpcode()) {
|
||||
default:
|
||||
--delta;
|
||||
break;
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZX:
|
||||
break;
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ARM64ConditionalCompares Pass
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace {
|
||||
class ARM64ConditionalCompares : public MachineFunctionPass {
|
||||
const TargetInstrInfo *TII;
|
||||
const TargetRegisterInfo *TRI;
|
||||
const MCSchedModel *SchedModel;
|
||||
// Does the function being processed have the MinSize (Oz) attribute?
|
||||
bool MinSize;
|
||||
MachineRegisterInfo *MRI;
|
||||
MachineDominatorTree *DomTree;
|
||||
MachineLoopInfo *Loops;
|
||||
MachineTraceMetrics *Traces;
|
||||
MachineTraceMetrics::Ensemble *MinInstr;
|
||||
SSACCmpConv CmpConv;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
ARM64ConditionalCompares() : MachineFunctionPass(ID) {}
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const;
|
||||
bool runOnMachineFunction(MachineFunction &MF);
|
||||
const char *getPassName() const { return "ARM64 Conditional Compares"; }
|
||||
|
||||
private:
|
||||
bool tryConvert(MachineBasicBlock *);
|
||||
void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
|
||||
void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
|
||||
void invalidateTraces();
|
||||
bool shouldConvert();
|
||||
};
|
||||
} // end anonymous namespace
|
||||
|
||||
char ARM64ConditionalCompares::ID = 0;
|
||||
|
||||
namespace llvm {
|
||||
void initializeARM64ConditionalComparesPass(PassRegistry &);
|
||||
}
|
||||
|
||||
INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
|
||||
false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
|
||||
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
|
||||
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
|
||||
INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
|
||||
false, false)
|
||||
|
||||
FunctionPass *llvm::createARM64ConditionalCompares() {
|
||||
return new ARM64ConditionalCompares();
|
||||
}
|
||||
|
||||
void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.addRequired<MachineBranchProbabilityInfo>();
|
||||
AU.addRequired<MachineDominatorTree>();
|
||||
AU.addPreserved<MachineDominatorTree>();
|
||||
AU.addRequired<MachineLoopInfo>();
|
||||
AU.addPreserved<MachineLoopInfo>();
|
||||
AU.addRequired<MachineTraceMetrics>();
|
||||
AU.addPreserved<MachineTraceMetrics>();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
/// Update the dominator tree after if-conversion erased some blocks.
|
||||
void
|
||||
ARM64ConditionalCompares::updateDomTree(ArrayRef<MachineBasicBlock *> Removed) {
|
||||
// convert() removes CmpBB which was previously dominated by Head.
|
||||
// CmpBB children should be transferred to Head.
|
||||
MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
|
||||
for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
|
||||
MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
|
||||
assert(Node != HeadNode && "Cannot erase the head node");
|
||||
assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
|
||||
while (Node->getNumChildren())
|
||||
DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
|
||||
DomTree->eraseNode(Removed[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Update LoopInfo after if-conversion.
|
||||
void
|
||||
ARM64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
|
||||
if (!Loops)
|
||||
return;
|
||||
for (unsigned i = 0, e = Removed.size(); i != e; ++i)
|
||||
Loops->removeBlock(Removed[i]);
|
||||
}
|
||||
|
||||
/// Invalidate MachineTraceMetrics before if-conversion.
|
||||
void ARM64ConditionalCompares::invalidateTraces() {
|
||||
Traces->invalidate(CmpConv.Head);
|
||||
Traces->invalidate(CmpConv.CmpBB);
|
||||
}
|
||||
|
||||
/// Apply cost model and heuristics to the if-conversion in IfConv.
|
||||
/// Return true if the conversion is a good idea.
|
||||
///
|
||||
bool ARM64ConditionalCompares::shouldConvert() {
|
||||
// Stress testing mode disables all cost considerations.
|
||||
if (Stress)
|
||||
return true;
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
|
||||
// Head dominates CmpBB, so it is always included in its trace.
|
||||
MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
|
||||
|
||||
// If code size is the main concern, check the expected code size delta first.
|
||||
if (MinSize) {
|
||||
int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
|
||||
DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
|
||||
// If we are minimizing the code size, do the conversion whatever
|
||||
// the cost is.
|
||||
if (CodeSizeDelta < 0)
|
||||
return true;
|
||||
if (CodeSizeDelta > 0) {
|
||||
DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
|
||||
return false;
|
||||
}
|
||||
// CodeSizeDelta == 0, continue with the regular heuristics
|
||||
}
|
||||
|
||||
// Heuristic: The compare conversion delays the execution of the branch
|
||||
// instruction because we must wait for the inputs to the second compare as
|
||||
// well. The branch has no dependent instructions, but delaying it increases
|
||||
// the cost of a misprediction.
|
||||
//
|
||||
// Set a limit on the delay we will accept.
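// For example, with a (hypothetical) misprediction penalty of 12 cycles the
// accepted extra delay is 12 * 3 / 4 = 9 cycles.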
|
||||
unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
|
||||
|
||||
// Instruction depths can be computed for all trace instructions above CmpBB.
|
||||
unsigned HeadDepth =
|
||||
Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
|
||||
unsigned CmpBBDepth =
|
||||
Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
|
||||
DEBUG(dbgs() << "Head depth: " << HeadDepth
|
||||
<< "\nCmpBB depth: " << CmpBBDepth << '\n');
|
||||
if (CmpBBDepth > HeadDepth + DelayLimit) {
|
||||
DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
|
||||
<< " cycles.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check the resource depth at the bottom of CmpBB - these instructions will
|
||||
// be speculated.
|
||||
unsigned ResDepth = Trace.getResourceDepth(true);
|
||||
DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
|
||||
|
||||
// Heuristic: The speculatively executed instructions must all be able to
|
||||
// merge into the Head block. The Head critical path should dominate the
|
||||
// resource cost of the speculated instructions.
|
||||
if (ResDepth > HeadDepth) {
|
||||
DEBUG(dbgs() << "Too many instructions to speculate.\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
|
||||
bool Changed = false;
|
||||
while (CmpConv.canConvert(MBB) && shouldConvert()) {
|
||||
invalidateTraces();
|
||||
SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
|
||||
CmpConv.convert(RemovedBlocks);
|
||||
Changed = true;
|
||||
updateDomTree(RemovedBlocks);
|
||||
updateLoops(RemovedBlocks);
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
|
||||
DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n"
|
||||
<< "********** Function: " << MF.getName() << '\n');
|
||||
TII = MF.getTarget().getInstrInfo();
|
||||
TRI = MF.getTarget().getRegisterInfo();
|
||||
SchedModel =
|
||||
MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
|
||||
MRI = &MF.getRegInfo();
|
||||
DomTree = &getAnalysis<MachineDominatorTree>();
|
||||
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
|
||||
Traces = &getAnalysis<MachineTraceMetrics>();
|
||||
MinInstr = 0;
|
||||
MinSize = MF.getFunction()->getAttributes().hasAttribute(
|
||||
AttributeSet::FunctionIndex, Attribute::MinSize);
|
||||
|
||||
bool Changed = false;
|
||||
CmpConv.runOnMachineFunction(MF);
|
||||
|
||||
// Visit blocks in dominator tree pre-order. The pre-order enables multiple
|
||||
// cmp-conversions from the same head block.
|
||||
// Note that updateDomTree() modifies the children of the DomTree node
|
||||
// currently being visited. The df_iterator supports that, it doesn't look at
|
||||
// child_begin() / child_end() until after a node has been visited.
|
||||
for (df_iterator<MachineDominatorTree *> I = df_begin(DomTree),
|
||||
E = df_end(DomTree);
|
||||
I != E; ++I)
|
||||
if (tryConvert(I->getBlock()))
|
||||
Changed = true;
|
||||
|
||||
return Changed;
|
||||
}
|
104
lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp
Normal file
@ -0,0 +1,104 @@
|
||||
//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// When allowed by the instruction, replace a dead definition of a GPR with
|
||||
// the zero register. This makes the code a bit friendlier towards the
|
||||
// hardware's register renamer.
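// For example, a "subs w0, w1, #5" whose w0 result is never read can be
// rewritten as "subs wzr, w1, #5", i.e. a plain "cmp w1, #5".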
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-dead-defs"
|
||||
#include "ARM64.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
using namespace llvm;
|
||||
|
||||
STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
|
||||
|
||||
namespace {
|
||||
class ARM64DeadRegisterDefinitions : public MachineFunctionPass {
|
||||
private:
|
||||
bool processMachineBasicBlock(MachineBasicBlock *MBB);
|
||||
|
||||
public:
|
||||
static char ID; // Pass identification, replacement for typeid.
|
||||
explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &F);
|
||||
|
||||
const char *getPassName() const { return "Dead register definitions"; }
|
||||
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
char ARM64DeadRegisterDefinitions::ID = 0;
|
||||
} // end anonymous namespace
|
||||
|
||||
bool
|
||||
ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock *MBB) {
|
||||
bool Changed = false;
|
||||
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
|
||||
++I) {
|
||||
MachineInstr *MI = I;
|
||||
for (int i = 0, e = MI->getDesc().getNumDefs(); i != e; ++i) {
|
||||
MachineOperand &MO = MI->getOperand(i);
|
||||
if (MO.isReg() && MO.isDead() && MO.isDef()) {
|
||||
assert(!MO.isImplicit() && "Unexpected implicit def!");
|
||||
DEBUG(dbgs() << " Dead def operand #" << i << " in:\n ";
|
||||
MI->print(dbgs()));
|
||||
// Be careful not to change the register if it's a tied operand.
|
||||
if (MI->isRegTiedToUseOperand(i)) {
|
||||
DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
|
||||
continue;
|
||||
}
|
||||
// Make sure the instruction takes a register class that contains
|
||||
// the zero register and replace it if so.
|
||||
unsigned NewReg;
|
||||
switch (MI->getDesc().OpInfo[i].RegClass) {
|
||||
default:
|
||||
DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
|
||||
continue;
|
||||
case ARM64::GPR32RegClassID:
|
||||
NewReg = ARM64::WZR;
|
||||
break;
|
||||
case ARM64::GPR64RegClassID:
|
||||
NewReg = ARM64::XZR;
|
||||
break;
|
||||
}
|
||||
DEBUG(dbgs() << " Replacing with zero register. New:\n ");
|
||||
MO.setReg(NewReg);
|
||||
DEBUG(MI->print(dbgs()));
|
||||
++NumDeadDefsReplaced;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Scan the function for instructions that have a dead definition of a
|
||||
// register. Replace that register with the zero register when possible.
|
||||
bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &mf) {
|
||||
MachineFunction *MF = &mf;
|
||||
bool Changed = false;
|
||||
DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n");
|
||||
|
||||
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
|
||||
if (processMachineBasicBlock(I))
|
||||
Changed = true;
|
||||
return Changed;
|
||||
}
|
||||
|
||||
FunctionPass *llvm::createARM64DeadRegisterDefinitions() {
|
||||
return new ARM64DeadRegisterDefinitions();
|
||||
}
|
726
lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp
Normal file
@ -0,0 +1,726 @@
|
||||
//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains a pass that expands pseudo instructions into target
|
||||
// instructions to allow proper scheduling and other late optimizations. This
|
||||
// pass should be run after register allocation but before the post-regalloc
|
||||
// scheduling pass.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "MCTargetDesc/ARM64AddressingModes.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
class ARM64ExpandPseudo : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
ARM64ExpandPseudo() : MachineFunctionPass(ID) {}
|
||||
|
||||
const ARM64InstrInfo *TII;
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &Fn);
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "ARM64 pseudo instruction expansion pass";
|
||||
}
|
||||
|
||||
private:
|
||||
bool expandMBB(MachineBasicBlock &MBB);
|
||||
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
|
||||
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
||||
unsigned BitSize);
|
||||
};
|
||||
char ARM64ExpandPseudo::ID = 0;
|
||||
}
|
||||
|
||||
/// \brief Transfer implicit operands on the pseudo instruction to the
|
||||
/// instructions created from the expansion.
|
||||
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
|
||||
MachineInstrBuilder &DefMI) {
|
||||
const MCInstrDesc &Desc = OldMI.getDesc();
|
||||
for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
|
||||
++i) {
|
||||
const MachineOperand &MO = OldMI.getOperand(i);
|
||||
assert(MO.isReg() && MO.getReg());
|
||||
if (MO.isUse())
|
||||
UseMI.addOperand(MO);
|
||||
else
|
||||
DefMI.addOperand(MO);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Helper function which extracts the specified 16-bit chunk from a
|
||||
/// 64-bit value.
|
||||
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
|
||||
assert(ChunkIdx < 4 && "Out of range chunk index specified!");
|
||||
|
||||
return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
|
||||
}
|
||||
|
||||
/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
|
||||
/// value. Indices correspond to element numbers in a v4i16.
|
||||
static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
|
||||
assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
|
||||
const unsigned ShiftAmt = ToIdx * 16;
|
||||
|
||||
// Replicate the source chunk to the destination position.
|
||||
const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
|
||||
// Clear the destination chunk.
|
||||
Imm &= ~(0xFFFFLL << ShiftAmt);
|
||||
// Insert the replicated chunk.
|
||||
return Imm | Chunk;
|
||||
}
|
||||
|
||||
/// \brief Helper function which tries to materialize a 64-bit value with an
|
||||
/// ORR + MOVK instruction sequence.
|
||||
static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
|
||||
MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator &MBBI,
|
||||
const ARM64InstrInfo *TII, unsigned ChunkIdx) {
|
||||
assert(ChunkIdx < 4 && "Out of range chunk index specified!");
|
||||
const unsigned ShiftAmt = ChunkIdx * 16;
|
||||
|
||||
uint64_t Encoding;
|
||||
if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
|
||||
// Create the ORR-immediate instruction.
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(ARM64::XZR)
|
||||
.addImm(Encoding);
|
||||
|
||||
// Create the MOVK instruction.
|
||||
const unsigned Imm16 = getChunk(UImm, ChunkIdx);
|
||||
const unsigned DstReg = MI.getOperand(0).getReg();
|
||||
const bool DstIsDead = MI.getOperand(0).isDead();
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
|
||||
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
|
||||
.addReg(DstReg)
|
||||
.addImm(Imm16)
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
|
||||
|
||||
transferImpOps(MI, MIB, MIB1);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
|
||||
/// can be materialized with an ORR instruction.
|
||||
static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
|
||||
Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
|
||||
|
||||
return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding);
|
||||
}
|
||||
|
||||
/// \brief Check for identical 16-bit chunks within the constant and if so
|
||||
/// materialize them with a single ORR instruction. The remaining one or two
|
||||
/// 16-bit chunks will be materialized with MOVK instructions.
|
||||
///
|
||||
/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
|
||||
/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
|
||||
/// an ORR instruction.
|
||||
///
|
||||
static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
|
||||
MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator &MBBI,
|
||||
const ARM64InstrInfo *TII) {
|
||||
typedef DenseMap<uint64_t, unsigned> CountMap;
|
||||
CountMap Counts;
|
||||
|
||||
// Scan the constant and count how often every chunk occurs.
|
||||
for (unsigned Idx = 0; Idx < 4; ++Idx)
|
||||
++Counts[getChunk(UImm, Idx)];
|
||||
|
||||
// Traverse the chunks to find one which occurs more than once.
|
||||
for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
|
||||
Chunk != End; ++Chunk) {
|
||||
const uint64_t ChunkVal = Chunk->first;
|
||||
const unsigned Count = Chunk->second;
|
||||
|
||||
uint64_t Encoding = 0;
|
||||
|
||||
// We are looking for chunks which have two or three instances and can be
|
||||
// materialized with an ORR instruction.
|
||||
if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
|
||||
continue;
|
||||
|
||||
const bool CountThree = Count == 3;
|
||||
// Create the ORR-immediate instruction.
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(ARM64::XZR)
|
||||
.addImm(Encoding);
|
||||
|
||||
const unsigned DstReg = MI.getOperand(0).getReg();
|
||||
const bool DstIsDead = MI.getOperand(0).isDead();
|
||||
|
||||
unsigned ShiftAmt = 0;
|
||||
uint64_t Imm16 = 0;
|
||||
// Find the first chunk not materialized with the ORR instruction.
|
||||
for (; ShiftAmt < 64; ShiftAmt += 16) {
|
||||
Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
|
||||
|
||||
if (Imm16 != ChunkVal)
|
||||
break;
|
||||
}
|
||||
|
||||
// Create the first MOVK instruction.
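// This MOVK completes the expansion only when the constant had three
// identical chunks, so only then may it inherit the dead flag.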
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
|
||||
.addReg(DstReg,
|
||||
RegState::Define | getDeadRegState(DstIsDead && CountThree))
|
||||
.addReg(DstReg)
|
||||
.addImm(Imm16)
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
|
||||
|
||||
// In case we have three instances the whole constant is now materialized
|
||||
// and we can exit.
|
||||
if (CountThree) {
|
||||
transferImpOps(MI, MIB, MIB1);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Find the remaining chunk which needs to be materialized.
|
||||
for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
|
||||
Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
|
||||
|
||||
if (Imm16 != ChunkVal)
|
||||
break;
|
||||
}
|
||||
|
||||
// Create the second MOVK instruction.
|
||||
MachineInstrBuilder MIB2 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
|
||||
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
|
||||
.addReg(DstReg)
|
||||
.addImm(Imm16)
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
|
||||
|
||||
transferImpOps(MI, MIB, MIB2);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
|
||||
/// starts a contiguous sequence of ones if we look at the bits from the LSB
|
||||
/// towards the MSB.
|
||||
static bool isStartChunk(uint64_t Chunk) {
|
||||
if (Chunk == 0 || Chunk == UINT64_MAX)
|
||||
return false;
|
||||
|
||||
return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64;
|
||||
}
|
||||
|
||||
/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
|
||||
/// ends a contiguous sequence of ones if we look at the bits from the LSB
|
||||
/// towards the MSB.
|
||||
static bool isEndChunk(uint64_t Chunk) {
|
||||
if (Chunk == 0 || Chunk == UINT64_MAX)
|
||||
return false;
|
||||
|
||||
return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64;
|
||||
}
|
||||
|
||||
/// \brief Clear or set all bits in the chunk at the given index.
|
||||
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
|
||||
const uint64_t Mask = 0xFFFF;
|
||||
|
||||
if (Clear)
|
||||
// Clear chunk in the immediate.
|
||||
Imm &= ~(Mask << (Idx * 16));
|
||||
else
|
||||
// Set all bits in the immediate for the particular chunk.
|
||||
Imm |= Mask << (Idx * 16);
|
||||
|
||||
return Imm;
|
||||
}
|
||||
|
||||
/// \brief Check whether the constant contains a sequence of contiguous ones,
|
||||
/// which might be interrupted by one or two chunks. If so, materialize the
|
||||
/// sequence of contiguous ones with an ORR instruction.
|
||||
/// Materialize the chunks which are either interrupting the sequence or outside
|
||||
/// of the sequence with a MOVK instruction.
|
||||
///
|
||||
/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
|
||||
/// which ends the sequence (0...1...). Then we are looking for constants which
|
||||
/// contain at least one S and E chunk.
|
||||
/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
|
||||
///
|
||||
/// We are also looking for constants like |S|A|B|E| where the contiguous
|
||||
/// sequence of ones wraps around the MSB into the LSB.
|
||||
///
|
||||
static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
|
||||
MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator &MBBI,
|
||||
const ARM64InstrInfo *TII) {
|
||||
const int NotSet = -1;
|
||||
const uint64_t Mask = 0xFFFF;
|
||||
|
||||
int StartIdx = NotSet;
|
||||
int EndIdx = NotSet;
|
||||
// Try to find the chunks which start/end a contiguous sequence of ones.
|
||||
for (int Idx = 0; Idx < 4; ++Idx) {
|
||||
int64_t Chunk = getChunk(UImm, Idx);
|
||||
// Sign extend the 16-bit chunk to 64-bit.
|
||||
Chunk = (Chunk << 48) >> 48;
|
||||
|
||||
if (isStartChunk(Chunk))
|
||||
StartIdx = Idx;
|
||||
else if (isEndChunk(Chunk))
|
||||
EndIdx = Idx;
|
||||
}
|
||||
|
||||
// Early exit in case we can't find a start/end chunk.
|
||||
if (StartIdx == NotSet || EndIdx == NotSet)
|
||||
return false;
|
||||
|
||||
// Outside of the contiguous sequence of ones everything needs to be zero.
|
||||
uint64_t Outside = 0;
|
||||
// Chunks between the start and end chunk need to have all their bits set.
|
||||
uint64_t Inside = Mask;
|
||||
|
||||
// If our contiguous sequence of ones wraps around from the MSB into the LSB,
|
||||
// just swap indices and pretend we are materializing a contiguous sequence
|
||||
// of zeros surrounded by a contiguous sequence of ones.
|
||||
if (StartIdx > EndIdx) {
|
||||
std::swap(StartIdx, EndIdx);
|
||||
std::swap(Outside, Inside);
|
||||
}
|
||||
|
||||
uint64_t OrrImm = UImm;
|
||||
int FirstMovkIdx = NotSet;
|
||||
int SecondMovkIdx = NotSet;
|
||||
|
||||
// Find out which chunks we need to patch up to obtain a contiguous sequence
|
||||
// of ones.
|
||||
for (int Idx = 0; Idx < 4; ++Idx) {
|
||||
const uint64_t Chunk = getChunk(UImm, Idx);
|
||||
|
||||
// Check whether we are looking at a chunk which is not part of the
|
||||
// contiguous sequence of ones.
|
||||
if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
|
||||
OrrImm = updateImm(OrrImm, Idx, Outside == 0);
|
||||
|
||||
// Remember the index we need to patch.
|
||||
if (FirstMovkIdx == NotSet)
|
||||
FirstMovkIdx = Idx;
|
||||
else
|
||||
SecondMovkIdx = Idx;
|
||||
|
||||
// Check whether we are looking at a chunk which is part of the contiguous
|
||||
// sequence of ones.
|
||||
} else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
|
||||
OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
|
||||
|
||||
// Remember the index we need to patch.
|
||||
if (FirstMovkIdx == NotSet)
|
||||
FirstMovkIdx = Idx;
|
||||
else
|
||||
SecondMovkIdx = Idx;
|
||||
}
|
||||
}
|
||||
assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
|
||||
|
||||
// Create the ORR-immediate instruction.
|
||||
uint64_t Encoding = 0;
|
||||
ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(ARM64::XZR)
|
||||
.addImm(Encoding);
|
||||
|
||||
const unsigned DstReg = MI.getOperand(0).getReg();
|
||||
const bool DstIsDead = MI.getOperand(0).isDead();
|
||||
|
||||
const bool SingleMovk = SecondMovkIdx == NotSet;
|
||||
// Create the first MOVK instruction.
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
|
||||
.addReg(DstReg,
|
||||
RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
|
||||
.addReg(DstReg)
|
||||
.addImm(getChunk(UImm, FirstMovkIdx))
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16));
|
||||
|
||||
// Early exit in case we only need to emit a single MOVK instruction.
|
||||
if (SingleMovk) {
|
||||
transferImpOps(MI, MIB, MIB1);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Create the second MOVK instruction.
|
||||
MachineInstrBuilder MIB2 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
|
||||
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
|
||||
.addReg(DstReg)
|
||||
.addImm(getChunk(UImm, SecondMovkIdx))
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16));
|
||||
|
||||
transferImpOps(MI, MIB, MIB2);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
|
||||
/// real move-immediate instructions to synthesize the immediate.
|
||||
bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
unsigned BitSize) {
|
||||
MachineInstr &MI = *MBBI;
|
||||
uint64_t Imm = MI.getOperand(1).getImm();
|
||||
const unsigned Mask = 0xFFFF;
|
||||
|
||||
// Try a MOVI instruction (aka ORR-immediate with the zero register).
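// First mask the immediate down to BitSize bits: the shift pair below is a
// no-op for 64-bit values and zero-extends 32-bit ones into UImm.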
|
||||
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
|
||||
uint64_t Encoding;
|
||||
if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
|
||||
unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri);
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR)
|
||||
.addImm(Encoding);
|
||||
transferImpOps(MI, MIB, MIB);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Scan the immediate and count the number of 16-bit chunks which are either
|
||||
// all ones or all zeros.
|
||||
unsigned OneChunks = 0;
|
||||
unsigned ZeroChunks = 0;
|
||||
for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
|
||||
const unsigned Chunk = (Imm >> Shift) & Mask;
|
||||
if (Chunk == Mask)
|
||||
OneChunks++;
|
||||
else if (Chunk == 0)
|
||||
ZeroChunks++;
|
||||
}
|
||||
|
||||
// Since we can't materialize the constant with a single ORR instruction,
|
||||
// let's see whether we can materialize 3/4 of the constant with an ORR
|
||||
// instruction and use an additional MOVK instruction to materialize the
|
||||
// remaining 1/4.
|
||||
//
|
||||
// We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
|
||||
//
|
||||
// E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
|
||||
// we would create the following instruction sequence:
|
||||
//
|
||||
// ORR x0, xzr, |A|X|A|X|
|
||||
// MOVK x0, |B|, LSL #16
|
||||
//
|
||||
// Only look at 64-bit constants which can't be materialized with a single
|
||||
// instruction, i.e. which have fewer than three all-zero or all-one
|
||||
// chunks.
|
||||
//
|
||||
// Ignore 32-bit constants here, they always can be materialized with a
|
||||
// MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
|
||||
// with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
|
||||
// Thus we fall back to the default code below which in the best case creates
|
||||
// a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
|
||||
//
|
||||
if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
|
||||
// If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
|
||||
// identical?
|
||||
if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
|
||||
// See if we can come up with a constant which can be materialized with
|
||||
// ORR-immediate by replicating element 3 into element 1.
|
||||
uint64_t OrrImm = replicateChunk(UImm, 3, 1);
|
||||
if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
|
||||
return true;
|
||||
|
||||
// See if we can come up with a constant which can be materialized with
|
||||
// ORR-immediate by replicating element 1 into element 3.
|
||||
OrrImm = replicateChunk(UImm, 1, 3);
|
||||
if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
|
||||
return true;
|
||||
|
||||
// If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
|
||||
// identical?
|
||||
} else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
|
||||
// See if we can come up with a constant which can be materialized with
|
||||
// ORR-immediate by replicating element 2 into element 0.
|
||||
uint64_t OrrImm = replicateChunk(UImm, 2, 0);
|
||||
if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
|
||||
return true;
|
||||
|
||||
// See if we can come up with a constant which can be materialized with
|
||||
// ORR-immediate by replicating element 0 into element 2.
|
||||
OrrImm = replicateChunk(UImm, 0, 2);
|
||||
if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for identical 16-bit chunks within the constant and if so materialize
|
||||
// them with a single ORR instruction. The remaining one or two 16-bit chunks
|
||||
// will be materialized with MOVK instructions.
|
||||
if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
|
||||
return true;
|
||||
|
||||
// Check whether the constant contains a sequence of contiguous ones, which
|
||||
// might be interrupted by one or two chunks. If so, materialize the sequence
|
||||
// of contiguous ones with an ORR instruction. Materialize the chunks which
|
||||
// are either interrupting the sequence or outside of the sequence with a
|
||||
// MOVK instruction.
|
||||
if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
|
||||
return true;
|
||||
|
||||
// Use a MOVZ or MOVN instruction to set the high bits, followed by one or
|
||||
// more MOVK instructions to insert additional 16-bit portions into the
|
||||
// lower bits.
|
||||
bool isNeg = false;
|
||||
|
||||
// Use MOVN to materialize the high bits if we have more all one chunks
|
||||
// than all zero chunks.
|
||||
if (OneChunks > ZeroChunks) {
|
||||
isNeg = true;
|
||||
Imm = ~Imm;
|
||||
}
|
||||
|
||||
unsigned FirstOpc;
|
||||
if (BitSize == 32) {
|
||||
Imm &= (1LL << 32) - 1;
|
||||
FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi);
|
||||
} else {
|
||||
FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi);
|
||||
}
|
||||
unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
|
||||
unsigned LastShift = 0; // LSL amount for last MOVK
|
||||
if (Imm != 0) {
|
||||
unsigned LZ = countLeadingZeros(Imm);
|
||||
unsigned TZ = countTrailingZeros(Imm);
|
||||
Shift = ((63 - LZ) / 16) * 16;
|
||||
LastShift = (TZ / 16) * 16;
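// Shift now addresses the highest 16-bit chunk containing a set bit and
// LastShift the lowest; chunks outside that range are already correct.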
|
||||
}
|
||||
unsigned Imm16 = (Imm >> Shift) & Mask;
|
||||
unsigned DstReg = MI.getOperand(0).getReg();
|
||||
bool DstIsDead = MI.getOperand(0).isDead();
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
|
||||
.addReg(DstReg, RegState::Define |
|
||||
getDeadRegState(DstIsDead && Shift == LastShift))
|
||||
.addImm(Imm16)
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
|
||||
|
||||
// If a MOVN was used for the high bits of a negative value, flip the rest
|
||||
// of the bits back for use with MOVK.
|
||||
if (isNeg)
|
||||
Imm = ~Imm;
|
||||
|
||||
if (Shift == LastShift) {
|
||||
transferImpOps(MI, MIB1, MIB1);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
MachineInstrBuilder MIB2;
|
||||
unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi);
|
||||
while (Shift != LastShift) {
|
||||
Shift -= 16;
|
||||
Imm16 = (Imm >> Shift) & Mask;
|
||||
if (Imm16 == (isNeg ? Mask : 0))
|
||||
continue; // This 16-bit portion is already set correctly.
|
||||
MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
|
||||
.addReg(DstReg,
|
||||
RegState::Define |
|
||||
getDeadRegState(DstIsDead && Shift == LastShift))
|
||||
.addReg(DstReg)
|
||||
.addImm(Imm16)
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
|
||||
}
|
||||
|
||||
transferImpOps(MI, MIB1, MIB2);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief If MBBI references a pseudo instruction that should be expanded here,
|
||||
/// do the expansion and return true. Otherwise return false.
|
||||
bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI) {
|
||||
MachineInstr &MI = *MBBI;
|
||||
unsigned Opcode = MI.getOpcode();
|
||||
switch (Opcode) {
|
||||
default:
|
||||
break;
|
||||
|
||||
case ARM64::ADDWrr:
|
||||
case ARM64::SUBWrr:
|
||||
case ARM64::ADDXrr:
|
||||
case ARM64::SUBXrr:
|
||||
case ARM64::ADDSWrr:
|
||||
case ARM64::SUBSWrr:
|
||||
case ARM64::ADDSXrr:
|
||||
case ARM64::SUBSXrr:
|
||||
case ARM64::ANDWrr:
|
||||
case ARM64::ANDXrr:
|
||||
case ARM64::BICWrr:
|
||||
case ARM64::BICXrr:
|
||||
case ARM64::EONWrr:
|
||||
case ARM64::EONXrr:
|
||||
case ARM64::EORWrr:
|
||||
case ARM64::EORXrr:
|
||||
case ARM64::ORNWrr:
|
||||
case ARM64::ORNXrr:
|
||||
case ARM64::ORRWrr:
|
||||
case ARM64::ORRXrr: {
|
||||
unsigned Opcode;
|
||||
switch (MI.getOpcode()) {
|
||||
default:
|
||||
return false;
|
||||
case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break;
|
||||
case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break;
|
||||
case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break;
|
||||
case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break;
|
||||
case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break;
|
||||
case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break;
|
||||
case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break;
|
||||
case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break;
|
||||
case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break;
|
||||
case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break;
|
||||
case ARM64::BICWrr: Opcode = ARM64::BICWrs; break;
|
||||
case ARM64::BICXrr: Opcode = ARM64::BICXrs; break;
|
||||
case ARM64::EONWrr: Opcode = ARM64::EONWrs; break;
|
||||
case ARM64::EONXrr: Opcode = ARM64::EONXrs; break;
|
||||
case ARM64::EORWrr: Opcode = ARM64::EORWrs; break;
|
||||
case ARM64::EORXrr: Opcode = ARM64::EORXrs; break;
|
||||
case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break;
|
||||
case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break;
|
||||
case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break;
|
||||
case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break;
|
||||
}
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
|
||||
MI.getOperand(0).getReg())
|
||||
.addOperand(MI.getOperand(1))
|
||||
.addOperand(MI.getOperand(2))
|
||||
.addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
|
||||
transferImpOps(MI, MIB1, MIB1);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
case ARM64::LOADgot: {
|
||||
// Expand into ADRP + LDR.
|
||||
unsigned DstReg = MI.getOperand(0).getReg();
|
||||
const MachineOperand &MO1 = MI.getOperand(1);
|
||||
unsigned Flags = MO1.getTargetFlags();
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg);
|
||||
MachineInstrBuilder MIB2 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(DstReg);
|
||||
|
||||
if (MO1.isGlobal()) {
|
||||
MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE);
|
||||
MIB2.addGlobalAddress(MO1.getGlobal(), 0,
|
||||
Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
|
||||
} else if (MO1.isSymbol()) {
|
||||
MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE);
|
||||
MIB2.addExternalSymbol(MO1.getSymbolName(),
|
||||
Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
|
||||
} else {
|
||||
assert(MO1.isCPI() &&
|
||||
"Only expect globals, externalsymbols, or constant pools");
|
||||
MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
|
||||
Flags | ARM64II::MO_PAGE);
|
||||
MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
|
||||
Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
|
||||
}
|
||||
|
||||
transferImpOps(MI, MIB1, MIB2);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
case ARM64::MOVaddr:
|
||||
case ARM64::MOVaddrJT:
|
||||
case ARM64::MOVaddrCP:
|
||||
case ARM64::MOVaddrBA:
|
||||
case ARM64::MOVaddrTLS:
|
||||
case ARM64::MOVaddrEXT: {
|
||||
// Expand into ADRP + ADD.
|
||||
unsigned DstReg = MI.getOperand(0).getReg();
|
||||
MachineInstrBuilder MIB1 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg)
|
||||
.addOperand(MI.getOperand(1));
|
||||
|
||||
MachineInstrBuilder MIB2 =
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri))
|
||||
.addOperand(MI.getOperand(0))
|
||||
.addReg(DstReg)
|
||||
.addOperand(MI.getOperand(2))
|
||||
.addImm(0);
|
||||
|
||||
transferImpOps(MI, MIB1, MIB2);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
case ARM64::MOVi32imm:
|
||||
return expandMOVImm(MBB, MBBI, 32);
|
||||
case ARM64::MOVi64imm:
|
||||
return expandMOVImm(MBB, MBBI, 64);
|
||||
case ARM64::RET_ReallyLR:
|
||||
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET))
|
||||
.addReg(ARM64::LR);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Iterate over the instructions in basic block MBB and expand any
|
||||
/// pseudo instructions. Return true if anything was modified.
|
||||
bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
|
||||
bool Modified = false;
|
||||
|
||||
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
|
||||
while (MBBI != E) {
|
||||
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
|
||||
Modified |= expandMI(MBB, MBBI);
|
||||
MBBI = NMBBI;
|
||||
}
|
||||
|
||||
return Modified;
|
||||
}
|
||||
|
||||
bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
|
||||
TII = static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
|
||||
|
||||
bool Modified = false;
|
||||
for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
|
||||
++MFI)
|
||||
Modified |= expandMBB(*MFI);
|
||||
return Modified;
|
||||
}
|
||||
|
||||
/// \brief Returns an instance of the pseudo instruction expansion pass.
|
||||
FunctionPass *llvm::createARM64ExpandPseudoPass() {
|
||||
return new ARM64ExpandPseudo();
|
||||
}
|
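For reference, the address-materialization pseudos handled above expand into two-instruction ADRP-based sequences. Schematically (Darwin-style relocation syntax; illustrative rather than exact assembler output):

// MOVaddr xD, sym   ->   adrp xD, sym@PAGE
//                        add  xD, xD, sym@PAGEOFF
// LOADgot xD, sym   ->   adrp xD, sym@GOTPAGE
//                        ldr  xD, [xD, sym@GOTPAGEOFF]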
1929
lib/Target/ARM64/ARM64FastISel.cpp
Normal file
File diff suppressed because it is too large
818
lib/Target/ARM64/ARM64FrameLowering.cpp
Normal file
@ -0,0 +1,818 @@
|
||||
//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the ARM64 implementation of TargetFrameLowering class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "frame-info"
|
||||
#include "ARM64FrameLowering.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64MachineFunctionInfo.h"
|
||||
#include "ARM64Subtarget.h"
|
||||
#include "ARM64TargetMachine.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineModuleInfo.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/RegisterScavenging.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool> EnableRedZone("arm64-redzone",
|
||||
cl::desc("enable use of redzone on ARM64"),
|
||||
cl::init(false), cl::Hidden);
|
||||
|
||||
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
|
||||
|
||||
static unsigned estimateStackSize(MachineFunction &MF) {
|
||||
const MachineFrameInfo *FFI = MF.getFrameInfo();
|
||||
int Offset = 0;
|
||||
for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
|
||||
int FixedOff = -FFI->getObjectOffset(i);
|
||||
if (FixedOff > Offset)
|
||||
Offset = FixedOff;
|
||||
}
|
||||
for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
|
||||
if (FFI->isDeadObjectIndex(i))
|
||||
continue;
|
||||
Offset += FFI->getObjectSize(i);
|
||||
unsigned Align = FFI->getObjectAlignment(i);
|
||||
// Adjust to alignment boundary
|
||||
Offset = (Offset + Align - 1) / Align * Align;
|
||||
}
|
||||
// This does not include the 16 bytes used for fp and lr.
|
||||
return (unsigned)Offset;
|
||||
}
|
||||
|
||||
bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
|
||||
if (!EnableRedZone)
|
||||
return false;
|
||||
// Don't use the red zone if the function explicitly asks us not to.
|
||||
// This is typically used for kernel code.
|
||||
if (MF.getFunction()->getAttributes().hasAttribute(
|
||||
AttributeSet::FunctionIndex, Attribute::NoRedZone))
|
||||
return false;
|
||||
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
|
||||
unsigned NumBytes = AFI->getLocalStackSize();
|
||||
|
||||
// Note: currently hasFP() is always true for hasCalls(), but that's an
|
||||
// implementation detail of the current code, not a strict requirement,
|
||||
// so stay safe here and check both.
|
||||
if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// hasFP - Return true if the specified function should have a dedicated frame
|
||||
/// pointer register.
|
||||
bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const {
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
|
||||
#ifndef NDEBUG
|
||||
const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
|
||||
assert(!RegInfo->needsStackRealignment(MF) &&
|
||||
"No stack realignment on ARM64!");
|
||||
#endif
|
||||
|
||||
return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
|
||||
MFI->isFrameAddressTaken());
|
||||
}
|
||||
|
||||
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
|
||||
/// not required, we reserve argument space for call sites in the function
|
||||
/// immediately on entry to the current function. This eliminates the need for
|
||||
/// add/sub sp brackets around call sites. Returns true if the call frame is
|
||||
/// included as part of the stack frame.
|
||||
bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
|
||||
return !MF.getFrameInfo()->hasVarSizedObjects();
|
||||
}
|
||||
|
||||
void ARM64FrameLowering::eliminateCallFramePseudoInstr(
|
||||
MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I) const {
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
const ARM64InstrInfo *TII =
|
||||
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
|
||||
if (!TFI->hasReservedCallFrame(MF)) {
|
||||
// If we have alloca, convert as follows:
|
||||
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
|
||||
// ADJCALLSTACKUP -> add, sp, sp, amount
|
||||
MachineInstr *Old = I;
|
||||
DebugLoc DL = Old->getDebugLoc();
|
||||
unsigned Amount = Old->getOperand(0).getImm();
|
||||
if (Amount != 0) {
|
||||
// We need to keep the stack aligned properly. To do this, we round the
|
||||
// amount of space needed for the outgoing arguments up to the next
|
||||
// alignment boundary.
|
||||
unsigned Align = TFI->getStackAlignment();
|
||||
Amount = (Amount + Align - 1) / Align * Align;
|
||||
|
||||
// Replace the pseudo instruction with a new instruction...
|
||||
unsigned Opc = Old->getOpcode();
|
||||
if (Opc == ARM64::ADJCALLSTACKDOWN) {
|
||||
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
|
||||
} else {
|
||||
assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
|
||||
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
|
||||
}
|
||||
}
|
||||
}
|
||||
MBB.erase(I);
|
||||
}
|
||||
|
||||
void
|
||||
ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
unsigned FramePtr) const {
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
MachineModuleInfo &MMI = MF.getMMI();
|
||||
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
|
||||
const ARM64InstrInfo *TII = TM.getInstrInfo();
|
||||
DebugLoc DL = MBB.findDebugLoc(MBBI);
|
||||
|
||||
// Add callee saved registers to move list.
|
||||
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
|
||||
if (CSI.empty())
|
||||
return;
|
||||
|
||||
const DataLayout *TD = MF.getTarget().getDataLayout();
|
||||
bool HasFP = hasFP(MF);
|
||||
|
||||
// Calculate the number of bytes used for storing the return address.
|
||||
int stackGrowth = -TD->getPointerSize(0);
|
||||
|
||||
// Calculate offsets.
|
||||
int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
|
||||
unsigned TotalSkipped = 0;
|
||||
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
|
||||
E = CSI.end();
|
||||
I != E; ++I) {
|
||||
unsigned Reg = I->getReg();
|
||||
int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()) -
|
||||
getOffsetOfLocalArea() + saveAreaOffset;
|
||||
|
||||
// Don't output a new CFI directive if we're re-saving the frame pointer or
|
||||
// link register. This happens when the PrologEpilogInserter has inserted an
|
||||
// extra "STP" of the frame pointer and link register -- the "emitPrologue"
|
||||
// method automatically generates the directives when frame pointers are
|
||||
// used. If we generate CFI directives for the extra "STP"s, the linker will
|
||||
// lose track of the correct values for the frame pointer and link register.
|
||||
if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) {
|
||||
TotalSkipped += stackGrowth;
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
|
||||
unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
|
||||
nullptr, DwarfReg, Offset - TotalSkipped));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
}
|
||||
}
|
||||
|
||||
void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const {
|
||||
MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
|
||||
MachineBasicBlock::iterator MBBI = MBB.begin();
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
const Function *Fn = MF.getFunction();
|
||||
const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo();
|
||||
const ARM64InstrInfo *TII = TM.getInstrInfo();
|
||||
MachineModuleInfo &MMI = MF.getMMI();
|
||||
ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
|
||||
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
|
||||
bool HasFP = hasFP(MF);
|
||||
DebugLoc DL = MBB.findDebugLoc(MBBI);
|
||||
|
||||
int NumBytes = (int)MFI->getStackSize();
|
||||
if (!AFI->hasStackFrame()) {
|
||||
assert(!HasFP && "unexpected function without stack frame but with FP");
|
||||
|
||||
// All of the stack allocation is for locals.
|
||||
AFI->setLocalStackSize(NumBytes);
|
||||
|
||||
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
|
||||
MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
|
||||
|
||||
// REDZONE: If the stack size is less than 128 bytes, we don't need
|
||||
// to actually allocate.
|
||||
if (NumBytes && !canUseRedZone(MF)) {
|
||||
emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
|
||||
MachineInstr::FrameSetup);
|
||||
|
||||
// Encode the stack size of the leaf function.
|
||||
unsigned CFIIndex = MMI.addFrameInst(
|
||||
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
} else if (NumBytes) {
|
||||
++NumRedZoneFunctions;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Only set up FP if we actually need to.
|
||||
int FPOffset = 0;
|
||||
if (HasFP) {
|
||||
// First instruction must a) allocate the stack and b) have an immediate
|
||||
// that is a multiple of -2.
|
||||
assert((MBBI->getOpcode() == ARM64::STPXpre ||
|
||||
MBBI->getOpcode() == ARM64::STPDpre) &&
|
||||
MBBI->getOperand(2).getReg() == ARM64::SP &&
|
||||
MBBI->getOperand(3).getImm() < 0 &&
|
||||
(MBBI->getOperand(3).getImm() & 1) == 0);
|
||||
|
||||
// Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
|
||||
// required for the callee saved register area we get the frame pointer
|
||||
// by adding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
|
||||
FPOffset = -(MBBI->getOperand(3).getImm() + 2) * 8;
|
||||
assert(FPOffset >= 0 && "Bad Framepointer Offset");
|
||||
}
|
||||
|
||||
// Move past the saves of the callee-saved registers.
|
||||
while (MBBI->getOpcode() == ARM64::STPXi ||
|
||||
MBBI->getOpcode() == ARM64::STPDi ||
|
||||
MBBI->getOpcode() == ARM64::STPXpre ||
|
||||
MBBI->getOpcode() == ARM64::STPDpre) {
|
||||
++MBBI;
|
||||
NumBytes -= 16;
|
||||
}
|
||||
assert(NumBytes >= 0 && "Negative stack allocation size!?");
|
||||
if (HasFP) {
|
||||
// Issue sub fp, sp, FPOffset or
|
||||
// mov fp,sp when FPOffset is zero.
|
||||
// Note: All stores of callee-saved registers are marked as "FrameSetup".
|
||||
// This code marks the instruction(s) that set the FP also.
|
||||
emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII,
|
||||
MachineInstr::FrameSetup);
|
||||
}
|
||||
|
||||
// All of the remaining stack allocations are for locals.
|
||||
AFI->setLocalStackSize(NumBytes);
|
||||
|
||||
// Allocate space for the rest of the frame.
|
||||
if (NumBytes) {
|
||||
// If we're a leaf function, try using the red zone.
|
||||
if (!canUseRedZone(MF))
|
||||
emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
|
||||
MachineInstr::FrameSetup);
|
||||
}
|
||||
|
||||
// If we need a base pointer, set it up here. It's whatever the value of the
|
||||
// stack pointer is at this point. Any variable size objects will be allocated
|
||||
// after this, so we can still use the base pointer to reference locals.
|
||||
//
|
||||
// FIXME: Clarify FrameSetup flags here.
|
||||
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
|
||||
// needed.
|
||||
//
|
||||
if (RegInfo->hasBasePointer(MF))
|
||||
TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false);
|
||||
|
||||
if (needsFrameMoves) {
|
||||
const DataLayout *TD = MF.getTarget().getDataLayout();
|
||||
const int StackGrowth = -TD->getPointerSize(0);
|
||||
unsigned FramePtr = RegInfo->getFrameRegister(MF);
|
||||
|
||||
// An example of the prologue:
|
||||
//
|
||||
// .globl __foo
|
||||
// .align 2
|
||||
// __foo:
|
||||
// Ltmp0:
|
||||
// .cfi_startproc
|
||||
// .cfi_personality 155, ___gxx_personality_v0
|
||||
// Leh_func_begin:
|
||||
// .cfi_lsda 16, Lexception33
|
||||
//
|
||||
// stp xa,bx, [sp, -#offset]!
|
||||
// ...
|
||||
// stp x28, x27, [sp, #offset-32]
|
||||
// stp fp, lr, [sp, #offset-16]
|
||||
// add fp, sp, #offset - 16
|
||||
// sub sp, sp, #1360
|
||||
//
|
||||
// The Stack:
|
||||
// +-------------------------------------------+
|
||||
// 10000 | ........ | ........ | ........ | ........ |
|
||||
// 10004 | ........ | ........ | ........ | ........ |
|
||||
// +-------------------------------------------+
|
||||
// 10008 | ........ | ........ | ........ | ........ |
|
||||
// 1000c | ........ | ........ | ........ | ........ |
|
||||
// +===========================================+
|
||||
// 10010 | X28 Register |
|
||||
// 10014 | X28 Register |
|
||||
// +-------------------------------------------+
|
||||
// 10018 | X27 Register |
|
||||
// 1001c | X27 Register |
|
||||
// +===========================================+
|
||||
// 10020 | Frame Pointer |
|
||||
// 10024 | Frame Pointer |
|
||||
// +-------------------------------------------+
|
||||
// 10028 | Link Register |
|
||||
// 1002c | Link Register |
|
||||
// +===========================================+
|
||||
// 10030 | ........ | ........ | ........ | ........ |
|
||||
// 10034 | ........ | ........ | ........ | ........ |
|
||||
// +-------------------------------------------+
|
||||
// 10038 | ........ | ........ | ........ | ........ |
|
||||
// 1003c | ........ | ........ | ........ | ........ |
|
||||
// +-------------------------------------------+
|
||||
//
|
||||
// [sp] = 10030 :: >>initial value<<
|
||||
// sp = 10020 :: stp fp, lr, [sp, #-16]!
|
||||
// fp = sp == 10020 :: mov fp, sp
|
||||
// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
|
||||
// sp == 10010 :: >>final value<<
|
||||
//
|
||||
// The frame pointer (w29) points to address 10020. If we use an offset of
|
||||
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
|
||||
// for w27, and -32 for w28:
|
||||
//
|
||||
// Ltmp1:
|
||||
// .cfi_def_cfa w29, 16
|
||||
// Ltmp2:
|
||||
// .cfi_offset w30, -8
|
||||
// Ltmp3:
|
||||
// .cfi_offset w29, -16
|
||||
// Ltmp4:
|
||||
// .cfi_offset w27, -24
|
||||
// Ltmp5:
|
||||
// .cfi_offset w28, -32
|
||||
|
||||
if (HasFP) {
|
||||
// Define the current CFA rule to use the provided FP.
|
||||
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
|
||||
unsigned CFIIndex = MMI.addFrameInst(
|
||||
MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
|
||||
// Record the location of the stored LR
|
||||
unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true);
|
||||
CFIIndex = MMI.addFrameInst(
|
||||
MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
|
||||
// Record the location of the stored FP
|
||||
CFIIndex = MMI.addFrameInst(
|
||||
MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
} else {
|
||||
// Encode the stack size of the leaf function.
|
||||
unsigned CFIIndex = MMI.addFrameInst(
|
||||
MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
|
||||
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
|
||||
.addCFIIndex(CFIIndex);
|
||||
}
|
||||
|
||||
// Now emit the moves for whatever callee saved regs we have.
|
||||
emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
|
||||
}
|
||||
}
|
||||
|
||||
static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
|
||||
for (unsigned i = 0; CSRegs[i]; ++i)
|
||||
if (Reg == CSRegs[i])
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
|
||||
if (MI->getOpcode() == ARM64::LDPXpost ||
|
||||
MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi ||
|
||||
MI->getOpcode() == ARM64::LDPDi) {
|
||||
if (!isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) ||
|
||||
!isCalleeSavedRegister(MI->getOperand(1).getReg(), CSRegs) ||
|
||||
MI->getOperand(2).getReg() != ARM64::SP)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
MachineBasicBlock &MBB) const {
|
||||
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
|
||||
assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
|
||||
MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
const ARM64InstrInfo *TII =
|
||||
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
|
||||
const ARM64RegisterInfo *RegInfo =
|
||||
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
|
||||
DebugLoc DL = MBBI->getDebugLoc();
|
||||
|
||||
unsigned NumBytes = MFI->getStackSize();
|
||||
unsigned NumRestores = 0;
|
||||
// Move past the restores of the callee-saved registers.
|
||||
MachineBasicBlock::iterator LastPopI = MBBI;
|
||||
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
|
||||
if (LastPopI != MBB.begin()) {
|
||||
do {
|
||||
++NumRestores;
|
||||
--LastPopI;
|
||||
} while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
|
||||
if (!isCSRestore(LastPopI, CSRegs)) {
|
||||
++LastPopI;
|
||||
--NumRestores;
|
||||
}
|
||||
}
|
||||
NumBytes -= NumRestores * 16;
|
||||
assert(NumBytes >= 0 && "Negative stack allocation size!?");
|
||||
|
||||
if (!hasFP(MF)) {
|
||||
// If this was a redzone leaf function, we don't need to restore the
|
||||
// stack pointer.
|
||||
if (!canUseRedZone(MF))
|
||||
emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII);
|
||||
return;
|
||||
}
|
||||
|
||||
// Restore the original stack pointer.
|
||||
// FIXME: Rather than doing the math here, we should instead just use
|
||||
// non-post-indexed loads for the restores if we aren't actually going to
|
||||
// be able to save any instructions.
|
||||
if (NumBytes || MFI->hasVarSizedObjects())
|
||||
emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP,
|
||||
-(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
|
||||
}
|
||||
|
||||
/// getFrameIndexOffset - Returns the displacement from the frame register to
|
||||
/// the stack frame of the specified index.
|
||||
int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
|
||||
int FI) const {
|
||||
unsigned FrameReg;
|
||||
return getFrameIndexReference(MF, FI, FrameReg);
|
||||
}
|
||||
|
||||
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
|
||||
/// debug info. It's the same as what we use for resolving the code-gen
|
||||
/// references for now. FIXME: This can go wrong when references are
|
||||
/// SP-relative and simple call frames aren't used.
|
||||
int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
|
||||
int FI,
|
||||
unsigned &FrameReg) const {
|
||||
return resolveFrameIndexReference(MF, FI, FrameReg);
|
||||
}
|
||||
|
||||
int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
|
||||
int FI, unsigned &FrameReg,
|
||||
bool PreferFP) const {
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
const ARM64RegisterInfo *RegInfo =
|
||||
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
|
||||
const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
|
||||
int FPOffset = MFI->getObjectOffset(FI) + 16;
|
||||
int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
|
||||
bool isFixed = MFI->isFixedObjectIndex(FI);
|
||||
|
||||
// Use frame pointer to reference fixed objects. Use it for locals if
|
||||
// there are VLAs (and thus the SP isn't reliable as a base).
|
||||
// Make sure useFPForScavengingIndex() does the right thing for the emergency
|
||||
// spill slot.
|
||||
bool UseFP = false;
|
||||
if (AFI->hasStackFrame()) {
|
||||
// Note: Keeping the following as multiple 'if' statements rather than
|
||||
// merging to a single expression for readability.
|
||||
//
|
||||
// Argument access should always use the FP.
|
||||
if (isFixed) {
|
||||
UseFP = hasFP(MF);
|
||||
} else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
|
||||
// Use SP or FP, whichever gives us the best chance of the offset
|
||||
// being in range for direct access. If the FPOffset is positive,
|
||||
// that'll always be best, as the SP will be even further away.
|
||||
// If the FPOffset is negative, we have to keep in mind that the
|
||||
// available offset range for negative offsets is smaller than for
|
||||
// positive ones. If we have variable sized objects, we're stuck with
|
||||
// using the FP regardless, though, as the SP offset is unknown
|
||||
// and we don't have a base pointer available. If an offset is
|
||||
// available via the FP and the SP, use whichever is closest.
|
||||
if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
|
||||
(FPOffset >= -256 && Offset > -FPOffset))
|
||||
UseFP = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (UseFP) {
|
||||
FrameReg = RegInfo->getFrameRegister(MF);
|
||||
return FPOffset;
|
||||
}
|
||||
|
||||
// Use the base pointer if we have one.
|
||||
if (RegInfo->hasBasePointer(MF))
|
||||
FrameReg = RegInfo->getBaseRegister();
|
||||
else {
|
||||
FrameReg = ARM64::SP;
|
||||
// If we're using the red zone for this function, the SP won't actually
|
||||
// be adjusted, so the offsets will be negative. They're also all
|
||||
// within range of the signed 9-bit immediate instructions.
|
||||
if (canUseRedZone(MF))
|
||||
Offset -= AFI->getLocalStackSize();
|
||||
}
|
||||
|
||||
return Offset;
|
||||
}
|
||||
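A compressed model of the base-register choice made in resolveFrameIndexReference above; this is a sketch under the assumption that only the FP-vs-SP distance matters (the PreferFP, variable-sized-object, and base-pointer cases are handled separately in the real code, and the helper name here is hypothetical).

// FPOffset: object offset addressed from the frame pointer (object + 16).
// SPOffset: the same object addressed from the post-allocation SP.
// Prefer the FP when its offset is non-negative, or when it is within the
// +/-256 unscaled-immediate range and no farther away than the SP offset.
static int pickFrameBaseOffset(int FPOffset, int SPOffset, bool &UseFP) {
  UseFP = FPOffset >= 0 || (FPOffset >= -256 && SPOffset > -FPOffset);
  return UseFP ? FPOffset : SPOffset;
}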
|
||||
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
|
||||
if (Reg != ARM64::LR)
|
||||
return getKillRegState(true);
|
||||
|
||||
// LR may be referred to later by an @llvm.returnaddress intrinsic.
|
||||
bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR);
|
||||
bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
|
||||
return getKillRegState(LRKill);
|
||||
}
|
||||
|
||||
bool ARM64FrameLowering::spillCalleeSavedRegisters(
|
||||
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
|
||||
const std::vector<CalleeSavedInfo> &CSI,
|
||||
const TargetRegisterInfo *TRI) const {
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
|
||||
unsigned Count = CSI.size();
|
||||
DebugLoc DL;
|
||||
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
|
||||
|
||||
if (MI != MBB.end())
|
||||
DL = MI->getDebugLoc();
|
||||
|
||||
for (unsigned i = 0; i < Count; i += 2) {
|
||||
unsigned idx = Count - i - 2;
|
||||
unsigned Reg1 = CSI[idx].getReg();
|
||||
unsigned Reg2 = CSI[idx + 1].getReg();
|
||||
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
|
||||
// list to come in sorted by frame index so that we can issue the store
|
||||
// pair instructions directly. Assert if we see anything otherwise.
|
||||
//
|
||||
// The order of the registers in the list is controlled by
|
||||
// getCalleeSavedRegs(), so they will always be in-order, as well.
|
||||
assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
|
||||
"Out of order callee saved regs!");
|
||||
unsigned StrOpc;
|
||||
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
|
||||
assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
|
||||
// Issue sequence of non-sp increment and pi sp spills for cs regs. The
|
||||
// first spill is a pre-increment that allocates the stack.
|
||||
// For example:
|
||||
// stp x22, x21, [sp, #-48]! // addImm(-6)
|
||||
// stp x20, x19, [sp, #16] // addImm(+2)
|
||||
// stp fp, lr, [sp, #32] // addImm(+4)
|
||||
// Rationale: This sequence saves uop updates compared to a sequence of
|
||||
// pre-increment spills like stp xi,xj,[sp,#-16]!
|
||||
// Note: Similar rationale and sequence for restores in epilog.
|
||||
if (ARM64::GPR64RegClass.contains(Reg1)) {
|
||||
assert(ARM64::GPR64RegClass.contains(Reg2) &&
|
||||
"Expected GPR64 callee-saved register pair!");
|
||||
// For first spill use pre-increment store.
|
||||
if (i == 0)
|
||||
StrOpc = ARM64::STPXpre;
|
||||
else
|
||||
StrOpc = ARM64::STPXi;
|
||||
} else if (ARM64::FPR64RegClass.contains(Reg1)) {
|
||||
assert(ARM64::FPR64RegClass.contains(Reg2) &&
|
||||
"Expected FPR64 callee-saved register pair!");
|
||||
// For first spill use pre-increment store.
|
||||
if (i == 0)
|
||||
StrOpc = ARM64::STPDpre;
|
||||
else
|
||||
StrOpc = ARM64::STPDi;
|
||||
} else
|
||||
llvm_unreachable("Unexpected callee saved register!");
|
||||
DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
|
||||
<< TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
|
||||
<< ", " << CSI[idx + 1].getFrameIdx() << ")\n");
|
||||
// Compute offset: i = 0 => offset = -Count;
|
||||
// i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
|
||||
const int Offset = (i == 0) ? -Count : i;
|
||||
assert((Offset >= -64 && Offset <= 63) &&
|
||||
"Offset out of bounds for STP immediate");
|
||||
BuildMI(MBB, MI, DL, TII.get(StrOpc))
|
||||
.addReg(Reg2, getPrologueDeath(MF, Reg2))
|
||||
.addReg(Reg1, getPrologueDeath(MF, Reg1))
|
||||
.addReg(ARM64::SP)
|
||||
.addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
|
||||
.setMIFlag(MachineInstr::FrameSetup);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
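The STP immediates used in the loop above follow a simple pattern: the first pair pre-decrements SP by the whole callee-save area, and every later pair stores at a positive offset into it. A small sketch (hypothetical helper; offsets are in 8-byte units, as in the STP encoding):

// For Count callee-saved registers (Count is even), pair index i = 0, 2, 4, ...
static int csrSpillOffset(unsigned i, unsigned Count) {
  return i == 0 ? -(int)Count   // stp ..., [sp, #-Count*8]!  (allocates the area)
                : (int)i;       // stp ..., [sp, #i*8]
}
// e.g. Count == 6 gives -6, 2, 4, matching the stp [sp,#-48]!/[sp,#16]/[sp,#32]
// example in the comment above.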
|
||||
bool ARM64FrameLowering::restoreCalleeSavedRegisters(
|
||||
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
|
||||
const std::vector<CalleeSavedInfo> &CSI,
|
||||
const TargetRegisterInfo *TRI) const {
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
|
||||
unsigned Count = CSI.size();
|
||||
DebugLoc DL;
|
||||
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
|
||||
|
||||
if (MI != MBB.end())
|
||||
DL = MI->getDebugLoc();
|
||||
|
||||
for (unsigned i = 0; i < Count; i += 2) {
|
||||
unsigned Reg1 = CSI[i].getReg();
|
||||
unsigned Reg2 = CSI[i + 1].getReg();
|
||||
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
|
||||
// list to come in sorted by frame index so that we can issue the store
|
||||
// pair instructions directly. Assert if we see anything otherwise.
|
||||
assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
|
||||
"Out of order callee saved regs!");
|
||||
// Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
|
||||
// the last load is sp-pi post-increment and de-allocates the stack:
|
||||
// For example:
|
||||
// ldp fp, lr, [sp, #32] // addImm(+4)
|
||||
// ldp x20, x19, [sp, #16] // addImm(+2)
|
||||
// ldp x22, x21, [sp], #48 // addImm(+6)
|
||||
// Note: see comment in spillCalleeSavedRegisters()
|
||||
unsigned LdrOpc;
|
||||
|
||||
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
|
||||
assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
|
||||
if (ARM64::GPR64RegClass.contains(Reg1)) {
|
||||
assert(ARM64::GPR64RegClass.contains(Reg2) &&
|
||||
"Expected GPR64 callee-saved register pair!");
|
||||
if (i == Count - 2)
|
||||
LdrOpc = ARM64::LDPXpost;
|
||||
else
|
||||
LdrOpc = ARM64::LDPXi;
|
||||
} else if (ARM64::FPR64RegClass.contains(Reg1)) {
|
||||
assert(ARM64::FPR64RegClass.contains(Reg2) &&
|
||||
"Expected FPR64 callee-saved register pair!");
|
||||
if (i == Count - 2)
|
||||
LdrOpc = ARM64::LDPDpost;
|
||||
else
|
||||
LdrOpc = ARM64::LDPDi;
|
||||
} else
|
||||
llvm_unreachable("Unexpected callee saved register!");
|
||||
DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
|
||||
<< TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
|
||||
<< ", " << CSI[i + 1].getFrameIdx() << ")\n");
|
||||
|
||||
// Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
|
||||
// etc.
|
||||
const int Offset = (i == Count - 2) ? Count : Count - i - 2;
|
||||
assert((Offset >= -64 && Offset <= 63) &&
|
||||
"Offset out of bounds for LDP immediate");
|
||||
BuildMI(MBB, MI, DL, TII.get(LdrOpc))
|
||||
.addReg(Reg2, getDefRegState(true))
|
||||
.addReg(Reg1, getDefRegState(true))
|
||||
.addReg(ARM64::SP)
|
||||
.addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
|
||||
// where the factor * 8 is implicit
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan(
|
||||
MachineFunction &MF, RegScavenger *RS) const {
|
||||
const ARM64RegisterInfo *RegInfo =
|
||||
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
|
||||
ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
|
||||
MachineRegisterInfo *MRI = &MF.getRegInfo();
|
||||
SmallVector<unsigned, 4> UnspilledCSGPRs;
|
||||
SmallVector<unsigned, 4> UnspilledCSFPRs;
|
||||
|
||||
// The frame record needs to be created by saving the appropriate registers
|
||||
if (hasFP(MF)) {
|
||||
MRI->setPhysRegUsed(ARM64::FP);
|
||||
MRI->setPhysRegUsed(ARM64::LR);
|
||||
}
|
||||
|
||||
// Spill the BasePtr if it's used. Do this first thing so that the
|
||||
// getCalleeSavedRegs() below will get the right answer.
|
||||
if (RegInfo->hasBasePointer(MF))
|
||||
MRI->setPhysRegUsed(RegInfo->getBaseRegister());
|
||||
|
||||
// If any callee-saved registers are used, the frame cannot be eliminated.
|
||||
unsigned NumGPRSpilled = 0;
|
||||
unsigned NumFPRSpilled = 0;
|
||||
bool ExtraCSSpill = false;
|
||||
bool CanEliminateFrame = true;
|
||||
DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
|
||||
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
|
||||
|
||||
// Check pairs of consecutive callee-saved registers.
|
||||
for (unsigned i = 0; CSRegs[i]; i += 2) {
|
||||
assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
|
||||
|
||||
const unsigned OddReg = CSRegs[i];
|
||||
const unsigned EvenReg = CSRegs[i + 1];
|
||||
assert((ARM64::GPR64RegClass.contains(OddReg) &&
|
||||
ARM64::GPR64RegClass.contains(EvenReg)) ^
|
||||
(ARM64::FPR64RegClass.contains(OddReg) &&
|
||||
ARM64::FPR64RegClass.contains(EvenReg)) &&
|
||||
"Register class mismatch!");
|
||||
|
||||
const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
|
||||
const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
|
||||
|
||||
// Early exit if none of the registers in the register pair is actually
|
||||
// used.
|
||||
if (!OddRegUsed && !EvenRegUsed) {
|
||||
if (ARM64::GPR64RegClass.contains(OddReg)) {
|
||||
UnspilledCSGPRs.push_back(OddReg);
|
||||
UnspilledCSGPRs.push_back(EvenReg);
|
||||
} else {
|
||||
UnspilledCSFPRs.push_back(OddReg);
|
||||
UnspilledCSFPRs.push_back(EvenReg);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned Reg = ARM64::NoRegister;
|
||||
// If only one of the registers of the register pair is used, make sure to
|
||||
// mark the other one as used as well.
|
||||
if (OddRegUsed ^ EvenRegUsed) {
|
||||
// Find out which register is the additional spill.
|
||||
Reg = OddRegUsed ? EvenReg : OddReg;
|
||||
MRI->setPhysRegUsed(Reg);
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
|
||||
DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
|
||||
|
||||
assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) ||
|
||||
(RegInfo->getEncodingValue(OddReg) + 1 ==
|
||||
RegInfo->getEncodingValue(EvenReg))) &&
|
||||
"Register pair of non-adjacent registers!");
|
||||
if (ARM64::GPR64RegClass.contains(OddReg)) {
|
||||
NumGPRSpilled += 2;
|
||||
// If it's not a reserved register, we can use it in lieu of an
|
||||
// emergency spill slot for the register scavenger.
|
||||
// FIXME: It would be better to instead keep looking and choose another
|
||||
// unspilled register that isn't reserved, if there is one.
|
||||
if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
|
||||
ExtraCSSpill = true;
|
||||
} else
|
||||
NumFPRSpilled += 2;
|
||||
|
||||
CanEliminateFrame = false;
|
||||
}
|
||||
|
||||
// FIXME: Set BigStack if any stack slot references may be out of range.
|
||||
// For now, just conservatively guestimate based on unscaled indexing
|
||||
// range. We'll end up allocating an unnecessary spill slot a lot, but
|
||||
// realistically that's not a big deal at this stage of the game.
|
||||
// The CSR spill slots have not been allocated yet, so estimateStackSize
|
||||
// won't include them.
|
||||
MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
|
||||
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
|
||||
bool BigStack = (CFSize >= 256);
|
||||
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
|
||||
AFI->setHasStackFrame(true);
|
||||
|
||||
// Estimate if we might need to scavenge a register at some point in order
|
||||
// to materialize a stack offset. If so, either spill one additional
|
||||
// callee-saved register or reserve a special spill slot to facilitate
|
||||
// register scavenging. If we already spilled an extra callee-saved register
|
||||
// above to keep the number of spills even, we don't need to do anything else
|
||||
// here.
|
||||
if (BigStack && !ExtraCSSpill) {
|
||||
|
||||
// If we're adding a register to spill here, we have to add two of them
|
||||
// to keep the number of regs to spill even.
|
||||
assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
|
||||
unsigned Count = 0;
|
||||
while (!UnspilledCSGPRs.empty() && Count < 2) {
|
||||
unsigned Reg = UnspilledCSGPRs.back();
|
||||
UnspilledCSGPRs.pop_back();
|
||||
DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
|
||||
<< " to get a scratch register.\n");
|
||||
MRI->setPhysRegUsed(Reg);
|
||||
ExtraCSSpill = true;
|
||||
++Count;
|
||||
}
|
||||
|
||||
// If we didn't find an extra callee-saved register to spill, create
|
||||
// an emergency spill slot.
|
||||
if (!ExtraCSSpill) {
|
||||
const TargetRegisterClass *RC = &ARM64::GPR64RegClass;
|
||||
int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
|
||||
RS->addScavengingFrameIndex(FI);
|
||||
DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
|
||||
<< " as the emergency spill slot.\n");
|
||||
}
|
||||
}
|
||||
}
|
75
lib/Target/ARM64/ARM64FrameLowering.h
Normal file
@ -0,0 +1,75 @@
|
||||
//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64_FRAMELOWERING_H
|
||||
#define ARM64_FRAMELOWERING_H
|
||||
|
||||
#include "llvm/Target/TargetFrameLowering.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class ARM64Subtarget;
|
||||
class ARM64TargetMachine;
|
||||
|
||||
class ARM64FrameLowering : public TargetFrameLowering {
|
||||
const ARM64TargetMachine &TM;
|
||||
|
||||
public:
|
||||
explicit ARM64FrameLowering(const ARM64TargetMachine &TM,
|
||||
const ARM64Subtarget &STI)
|
||||
: TargetFrameLowering(StackGrowsDown, 16, 0, 16,
|
||||
false /*StackRealignable*/),
|
||||
TM(TM) {}
|
||||
|
||||
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
unsigned FramePtr) const;
|
||||
|
||||
void eliminateCallFramePseudoInstr(MachineFunction &MF,
|
||||
MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I) const;
|
||||
|
||||
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
|
||||
/// the function.
|
||||
void emitPrologue(MachineFunction &MF) const;
|
||||
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
|
||||
|
||||
int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
|
||||
int getFrameIndexReference(const MachineFunction &MF, int FI,
|
||||
unsigned &FrameReg) const;
|
||||
int resolveFrameIndexReference(const MachineFunction &MF, int FI,
|
||||
unsigned &FrameReg,
|
||||
bool PreferFP = false) const;
|
||||
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MI,
|
||||
const std::vector<CalleeSavedInfo> &CSI,
|
||||
const TargetRegisterInfo *TRI) const;
|
||||
|
||||
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MI,
|
||||
const std::vector<CalleeSavedInfo> &CSI,
|
||||
const TargetRegisterInfo *TRI) const;
|
||||
|
||||
/// \brief Can this function use the red zone for local allocations?
|
||||
bool canUseRedZone(const MachineFunction &MF) const;
|
||||
|
||||
bool hasFP(const MachineFunction &MF) const;
|
||||
bool hasReservedCallFrame(const MachineFunction &MF) const;
|
||||
|
||||
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
|
||||
RegScavenger *RS) const;
|
||||
};
|
||||
|
||||
} // End llvm namespace
|
||||
|
||||
#endif
|
2395
lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
Normal file
File diff suppressed because it is too large
7587
lib/Target/ARM64/ARM64ISelLowering.cpp
Normal file
File diff suppressed because it is too large
423
lib/Target/ARM64/ARM64ISelLowering.h
Normal file
@ -0,0 +1,423 @@
|
||||
//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines the interfaces that ARM64 uses to lower LLVM code into a
|
||||
// selection DAG.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H
|
||||
#define LLVM_TARGET_ARM64_ISELLOWERING_H
|
||||
|
||||
#include "llvm/CodeGen/CallingConvLower.h"
|
||||
#include "llvm/CodeGen/SelectionDAG.h"
|
||||
#include "llvm/IR/CallingConv.h"
|
||||
#include "llvm/Target/TargetLowering.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace ARM64ISD {
|
||||
|
||||
enum {
|
||||
FIRST_NUMBER = ISD::BUILTIN_OP_END,
|
||||
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
|
||||
CALL, // Function call.
|
||||
|
||||
// Almost the same as a normal call node, except that a TLSDesc relocation is
|
||||
// needed so the linker can relax it correctly if possible.
|
||||
TLSDESC_CALL,
|
||||
ADRP, // Page address of a TargetGlobalAddress operand.
|
||||
ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
|
||||
LOADgot, // Load from automatically generated descriptor (e.g. Global
|
||||
// Offset Table, TLS record).
|
||||
RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
|
||||
BRCOND, // Conditional branch instruction; "b.cond".
|
||||
CSEL,
|
||||
FCSEL, // Conditional move instruction.
|
||||
CSINV, // Conditional select invert.
|
||||
CSNEG, // Conditional select negate.
|
||||
CSINC, // Conditional select increment.
|
||||
|
||||
// Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
|
||||
// ELF.
|
||||
THREAD_POINTER,
|
||||
ADC,
|
||||
SBC, // adc, sbc instructions
|
||||
|
||||
// Arithmetic instructions which write flags.
|
||||
ADDS,
|
||||
SUBS,
|
||||
ADCS,
|
||||
SBCS,
|
||||
ANDS,
|
||||
|
||||
// Floating point comparison
|
||||
FCMP,
|
||||
|
||||
// Floating point max and min instructions.
|
||||
FMAX,
|
||||
FMIN,
|
||||
|
||||
// Scalar extract
|
||||
EXTR,
|
||||
|
||||
// Scalar-to-vector duplication
|
||||
DUP,
|
||||
DUPLANE8,
|
||||
DUPLANE16,
|
||||
DUPLANE32,
|
||||
DUPLANE64,
|
||||
|
||||
// Vector immediate moves
|
||||
MOVI,
|
||||
MOVIshift,
|
||||
MOVIedit,
|
||||
MOVImsl,
|
||||
FMOV,
|
||||
MVNIshift,
|
||||
MVNImsl,
|
||||
|
||||
// Vector immediate ops
|
||||
BICi,
|
||||
ORRi,
|
||||
|
||||
// Vector arithmetic negation
|
||||
NEG,
|
||||
|
||||
// Vector shuffles
|
||||
ZIP1,
|
||||
ZIP2,
|
||||
UZP1,
|
||||
UZP2,
|
||||
TRN1,
|
||||
TRN2,
|
||||
REV16,
|
||||
REV32,
|
||||
REV64,
|
||||
EXT,
|
||||
|
||||
// Vector shift by scalar
|
||||
VSHL,
|
||||
VLSHR,
|
||||
VASHR,
|
||||
|
||||
// Vector shift by scalar (again)
|
||||
SQSHL_I,
|
||||
UQSHL_I,
|
||||
SQSHLU_I,
|
||||
SRSHR_I,
|
||||
URSHR_I,
|
||||
|
||||
// Vector comparisons
|
||||
CMEQ,
|
||||
CMGE,
|
||||
CMGT,
|
||||
CMHI,
|
||||
CMHS,
|
||||
FCMEQ,
|
||||
FCMGE,
|
||||
FCMGT,
|
||||
|
||||
// Vector zero comparisons
|
||||
CMEQz,
|
||||
CMGEz,
|
||||
CMGTz,
|
||||
CMLEz,
|
||||
CMLTz,
|
||||
FCMEQz,
|
||||
FCMGEz,
|
||||
FCMGTz,
|
||||
FCMLEz,
|
||||
FCMLTz,
|
||||
|
||||
// Vector bitwise negation
|
||||
NOT,
|
||||
|
||||
// Vector bitwise selection
|
||||
BIT,
|
||||
|
||||
// Compare-and-branch
|
||||
CBZ,
|
||||
CBNZ,
|
||||
TBZ,
|
||||
TBNZ,
|
||||
|
||||
// Tail calls
|
||||
TC_RETURN,
|
||||
|
||||
// Custom prefetch handling
|
||||
PREFETCH,
|
||||
|
||||
// {s|u}int to FP within a FP register.
|
||||
SITOF,
|
||||
UITOF
|
||||
};
|
||||
|
||||
} // end namespace ARM64ISD
|
||||
|
||||
class ARM64Subtarget;
|
||||
class ARM64TargetMachine;
|
||||
|
||||
class ARM64TargetLowering : public TargetLowering {
|
||||
bool RequireStrictAlign;
|
||||
|
||||
public:
|
||||
explicit ARM64TargetLowering(ARM64TargetMachine &TM);
|
||||
|
||||
/// Selects the correct CCAssignFn for the given CallingConvention
|
||||
/// value.
|
||||
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
|
||||
|
||||
/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
|
||||
/// Mask are known to be either zero or one and return them in the
|
||||
/// KnownZero/KnownOne bitsets.
|
||||
void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
|
||||
APInt &KnownOne, const SelectionDAG &DAG,
|
||||
unsigned Depth = 0) const;
|
||||
|
||||
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const;
|
||||
|
||||
/// allowsUnalignedMemoryAccesses - Returns true if the target allows
|
||||
/// unaligned memory accesses of the specified type.
|
||||
virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
|
||||
bool *Fast = 0) const {
|
||||
if (RequireStrictAlign)
|
||||
return false;
|
||||
// FIXME: True for Cyclone, but not necessarily for other targets.
|
||||
if (Fast)
|
||||
*Fast = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// LowerOperation - Provide custom lowering hooks for some operations.
|
||||
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
virtual const char *getTargetNodeName(unsigned Opcode) const;
|
||||
|
||||
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
/// getFunctionAlignment - Return the Log2 alignment of this function.
|
||||
virtual unsigned getFunctionAlignment(const Function *F) const;
|
||||
|
||||
/// getMaximalGlobalOffset - Returns the maximal possible offset which can
|
||||
/// be used for loads / stores from the global.
|
||||
virtual unsigned getMaximalGlobalOffset() const;
|
||||
|
||||
/// Returns true if a cast between SrcAS and DestAS is a noop.
|
||||
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
|
||||
// Addrspacecasts are always noops.
|
||||
return true;
|
||||
}
|
||||
|
||||
/// createFastISel - This method returns a target specific FastISel object,
|
||||
/// or null if the target does not support "fast" ISel.
|
||||
virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
|
||||
const TargetLibraryInfo *libInfo) const;
|
||||
|
||||
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
|
||||
|
||||
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
|
||||
|
||||
/// isShuffleMaskLegal - Return true if the given shuffle mask can be
|
||||
/// codegen'd directly, or if it should be stack expanded.
|
||||
virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
|
||||
|
||||
/// getSetCCResultType - Return the ISD::SETCC ValueType
|
||||
virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
|
||||
|
||||
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
|
||||
unsigned Size, unsigned BinOpcode) const;
|
||||
MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
|
||||
unsigned Size) const;
|
||||
MachineBasicBlock *EmitAtomicBinary128(MachineInstr *MI,
|
||||
MachineBasicBlock *BB,
|
||||
unsigned BinOpcodeLo,
|
||||
unsigned BinOpcodeHi) const;
|
||||
MachineBasicBlock *EmitAtomicCmpSwap128(MachineInstr *MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
MachineBasicBlock *EmitAtomicMinMax128(MachineInstr *MI,
|
||||
MachineBasicBlock *BB,
|
||||
unsigned CondCode) const;
|
||||
MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
virtual MachineBasicBlock *
|
||||
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
|
||||
|
||||
virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
|
||||
unsigned Intrinsic) const;
|
||||
|
||||
virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
|
||||
virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
|
||||
|
||||
virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
|
||||
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
|
||||
virtual bool isZExtFree(SDValue Val, EVT VT2) const;
|
||||
|
||||
virtual bool hasPairedLoad(Type *LoadedType,
|
||||
unsigned &RequiredAligment) const;
|
||||
virtual bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const;
|
||||
|
||||
virtual bool isLegalAddImmediate(int64_t) const;
|
||||
virtual bool isLegalICmpImmediate(int64_t) const;
|
||||
|
||||
virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
|
||||
unsigned SrcAlign, bool IsMemset,
|
||||
bool ZeroMemset, bool MemcpyStrSrc,
|
||||
MachineFunction &MF) const;
|
||||
|
||||
/// isLegalAddressingMode - Return true if the addressing mode represented
|
||||
/// by AM is legal for this target, for a load/store of the specified type.
|
||||
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
|
||||
|
||||
/// \brief Return the cost of the scaling factor used in the addressing
|
||||
/// mode represented by AM for this target, for a load/store
|
||||
/// of the specified type.
|
||||
/// If the AM is supported, the return value must be >= 0.
|
||||
/// If the AM is not supported, it returns a negative value.
|
||||
virtual int getScalingFactorCost(const AddrMode &AM, Type *Ty) const;
|
||||
|
||||
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
|
||||
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
|
||||
/// expanded to FMAs when this method returns true, otherwise fmuladd is
|
||||
/// expanded to fmul + fadd.
|
||||
virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
|
||||
|
||||
virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const;
|
||||
|
||||
virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
|
||||
Type *Ty) const;
|
||||
|
||||
private:
|
||||
/// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
|
||||
/// make the right decision when generating code for different targets.
|
||||
const ARM64Subtarget *Subtarget;
|
||||
|
||||
void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
|
||||
void addDRTypeForNEON(MVT VT);
|
||||
void addQRTypeForNEON(MVT VT);
|
||||
|
||||
virtual SDValue
|
||||
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
|
||||
SelectionDAG &DAG,
|
||||
SmallVectorImpl<SDValue> &InVals) const;
|
||||
|
||||
virtual SDValue LowerCall(CallLoweringInfo & /*CLI*/,
|
||||
SmallVectorImpl<SDValue> &InVals) const;
|
||||
|
||||
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
|
||||
CallingConv::ID CallConv, bool isVarArg,
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
|
||||
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
|
||||
bool isThisReturn, SDValue ThisVal) const;
|
||||
|
||||
bool isEligibleForTailCallOptimization(
|
||||
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
|
||||
bool isCalleeStructRet, bool isCallerStructRet,
|
||||
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
||||
const SmallVectorImpl<SDValue> &OutVals,
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
|
||||
|
||||
void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
|
||||
SDValue &Chain) const;
|
||||
|
||||
virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
|
||||
bool isVarArg,
|
||||
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
||||
LLVMContext &Context) const;
|
||||
|
||||
virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
||||
bool isVarArg,
|
||||
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
||||
const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
|
||||
SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
|
||||
SelectionDAG &DAG) const;
|
||||
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
|
||||
RTLIB::Libcall Call) const;
|
||||
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
ConstraintType getConstraintType(const std::string &Constraint) const;
|
||||
|
||||
/// Examine constraint string and operand type and determine a weight value.
|
||||
/// The operand object must already have been set up with the operand type.
|
||||
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info,
|
||||
const char *constraint) const;
|
||||
|
||||
std::pair<unsigned, const TargetRegisterClass *>
|
||||
getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
|
||||
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
|
||||
std::vector<SDValue> &Ops,
|
||||
SelectionDAG &DAG) const;
|
||||
|
||||
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
|
||||
bool mayBeEmittedAsTailCall(CallInst *CI) const;
|
||||
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
|
||||
ISD::MemIndexedMode &AM, bool &IsInc,
|
||||
SelectionDAG &DAG) const;
|
||||
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
|
||||
ISD::MemIndexedMode &AM,
|
||||
SelectionDAG &DAG) const;
|
||||
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
|
||||
SDValue &Offset, ISD::MemIndexedMode &AM,
|
||||
SelectionDAG &DAG) const;
|
||||
|
||||
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
|
||||
SelectionDAG &DAG) const;
|
||||
};
|
||||
|
||||
namespace ARM64 {
|
||||
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
|
||||
const TargetLibraryInfo *libInfo);
|
||||
} // end namespace ARM64
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_TARGET_ARM64_ISELLOWERING_H
|
293	lib/Target/ARM64/ARM64InstrAtomics.td	Normal file
@ -0,0 +1,293 @@
|
||||
//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// ARM64 Atomic operand code-gen constructs.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------
|
||||
// Atomic fences
|
||||
//===----------------------------------
|
||||
def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
|
||||
def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
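For illustration (my assumption, not something stated in this patch), these two patterns split source-level fences by ordering: an acquire-only fence (the ordering value 4 matched by the first pattern) can use the load-only barrier encoding (0x9), while every other ordering falls through to the full-barrier encoding (0xb). A minimal C++ sketch of the fences involved:

    #include <atomic>

    void acquire_fence() {
      // Expected to match the first pattern above (load-only barrier).
      std::atomic_thread_fence(std::memory_order_acquire);
    }

    void full_fence() {
      // Any other ordering falls through to the catch-all pattern (full barrier).
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }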
|
||||
|
||||
//===----------------------------------
|
||||
// Atomic loads
|
||||
//===----------------------------------
|
||||
|
||||
// When they're actually atomic, only one addressing mode (GPR64sp) is
|
||||
// supported, but when they're relaxed and anything can be used, all the
|
||||
// standard modes would be valid and may give efficiency gains.
|
||||
|
||||
// An atomic load operation that actually needs acquire semantics.
|
||||
class acquiring_load<PatFrag base>
|
||||
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
|
||||
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
|
||||
assert(Ordering != AcquireRelease && "unexpected load ordering");
|
||||
return Ordering == Acquire || Ordering == SequentiallyConsistent;
|
||||
}]>;
|
||||
|
||||
// An atomic load operation that does not need either acquire or release
|
||||
// semantics.
|
||||
class relaxed_load<PatFrag base>
|
||||
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
|
||||
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
|
||||
return Ordering == Monotonic || Ordering == Unordered;
|
||||
}]>;
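As a rough sketch (an assumption for illustration, not part of the patch), these two fragments distinguish the following kinds of C++ loads: acquire or seq_cst loads match acquiring_load and select the LDAR* patterns below, while relaxed or unordered loads match relaxed_load and are free to use the ordinary addressing modes:

    #include <atomic>

    int load_acquire(const std::atomic<int> &x) {
      return x.load(std::memory_order_acquire);  // acquiring_load -> LDAR form
    }

    int load_relaxed(const std::atomic<int> &x) {
      return x.load(std::memory_order_relaxed);  // relaxed_load -> plain LDR forms
    }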
|
||||
|
||||
// 8-bit loads
|
||||
def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_8> ro_indexed8:$addr),
|
||||
(LDRBBro ro_indexed8:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_8> am_indexed8:$addr),
|
||||
(LDRBBui am_indexed8:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_8> am_unscaled8:$addr),
|
||||
(LDURBBi am_unscaled8:$addr)>;
|
||||
|
||||
// 16-bit loads
|
||||
def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_16> ro_indexed16:$addr),
|
||||
(LDRHHro ro_indexed16:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_16> am_indexed16:$addr),
|
||||
(LDRHHui am_indexed16:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_16> am_unscaled16:$addr),
|
||||
(LDURHHi am_unscaled16:$addr)>;
|
||||
|
||||
// 32-bit loads
|
||||
def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_32> ro_indexed32:$addr),
|
||||
(LDRWro ro_indexed32:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_32> am_indexed32:$addr),
|
||||
(LDRWui am_indexed32:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_32> am_unscaled32:$addr),
|
||||
(LDURWi am_unscaled32:$addr)>;
|
||||
|
||||
// 64-bit loads
|
||||
def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_64> ro_indexed64:$addr),
|
||||
(LDRXro ro_indexed64:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_64> am_indexed64:$addr),
|
||||
(LDRXui am_indexed64:$addr)>;
|
||||
def : Pat<(relaxed_load<atomic_load_64> am_unscaled64:$addr),
|
||||
(LDURXi am_unscaled64:$addr)>;
|
||||
|
||||
//===----------------------------------
|
||||
// Atomic stores
|
||||
//===----------------------------------
|
||||
|
||||
// When they're actually atomic, only one addressing mode (GPR64sp) is
|
||||
// supported, but when they're relaxed and anything can be used, all the
|
||||
// standard modes would be valid and may give efficiency gains.
|
||||
|
||||
// A store operation that actually needs release semantics.
|
||||
class releasing_store<PatFrag base>
|
||||
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
|
||||
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
|
||||
assert(Ordering != AcquireRelease && "unexpected store ordering");
|
||||
return Ordering == Release || Ordering == SequentiallyConsistent;
|
||||
}]>;
|
||||
|
||||
// An atomic store operation that doesn't actually need to be atomic on ARM64.
|
||||
class relaxed_store<PatFrag base>
|
||||
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
|
||||
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
|
||||
return Ordering == Monotonic || Ordering == Unordered;
|
||||
}]>;
|
||||
|
||||
// 8-bit stores
|
||||
def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
|
||||
(STLRB GPR32:$val, GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_8> ro_indexed8:$ptr, GPR32:$val),
|
||||
(STRBBro GPR32:$val, ro_indexed8:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_8> am_indexed8:$ptr, GPR32:$val),
|
||||
(STRBBui GPR32:$val, am_indexed8:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_8> am_unscaled8:$ptr, GPR32:$val),
|
||||
(STURBBi GPR32:$val, am_unscaled8:$ptr)>;
|
||||
|
||||
// 16-bit stores
|
||||
def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
|
||||
(STLRH GPR32:$val, GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_16> ro_indexed16:$ptr, GPR32:$val),
|
||||
(STRHHro GPR32:$val, ro_indexed16:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_16> am_indexed16:$ptr, GPR32:$val),
|
||||
(STRHHui GPR32:$val, am_indexed16:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_16> am_unscaled16:$ptr, GPR32:$val),
|
||||
(STURHHi GPR32:$val, am_unscaled16:$ptr)>;
|
||||
|
||||
// 32-bit stores
|
||||
def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
|
||||
(STLRW GPR32:$val, GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_32> ro_indexed32:$ptr, GPR32:$val),
|
||||
(STRWro GPR32:$val, ro_indexed32:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_32> am_indexed32:$ptr, GPR32:$val),
|
||||
(STRWui GPR32:$val, am_indexed32:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_32> am_unscaled32:$ptr, GPR32:$val),
|
||||
(STURWi GPR32:$val, am_unscaled32:$ptr)>;
|
||||
|
||||
// 64-bit stores
|
||||
def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
|
||||
(STLRX GPR64:$val, GPR64sp:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_64> ro_indexed64:$ptr, GPR64:$val),
|
||||
(STRXro GPR64:$val, ro_indexed64:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_64> am_indexed64:$ptr, GPR64:$val),
|
||||
(STRXui GPR64:$val, am_indexed64:$ptr)>;
|
||||
def : Pat<(relaxed_store<atomic_store_64> am_unscaled64:$ptr, GPR64:$val),
|
||||
(STURXi GPR64:$val, am_unscaled64:$ptr)>;
|
||||
|
||||
//===----------------------------------
|
||||
// Atomic read-modify-write operations
|
||||
//===----------------------------------
|
||||
|
||||
// More complicated operations need lots of C++ support, so we just create
|
||||
// skeletons here for the C++ code to refer to.
|
||||
|
||||
let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in {
|
||||
multiclass AtomicSizes {
|
||||
def _I8 : Pseudo<(outs GPR32:$dst),
|
||||
(ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
|
||||
def _I16 : Pseudo<(outs GPR32:$dst),
|
||||
(ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
|
||||
def _I32 : Pseudo<(outs GPR32:$dst),
|
||||
(ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
|
||||
def _I64 : Pseudo<(outs GPR64:$dst),
|
||||
(ins GPR64sp:$ptr, GPR64:$incr, i32imm:$ordering), []>;
|
||||
def _I128 : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
|
||||
(ins GPR64sp:$ptr, GPR64:$incrlo, GPR64:$incrhi,
|
||||
i32imm:$ordering), []>;
|
||||
}
|
||||
}
|
||||
|
||||
defm ATOMIC_LOAD_ADD : AtomicSizes;
|
||||
defm ATOMIC_LOAD_SUB : AtomicSizes;
|
||||
defm ATOMIC_LOAD_AND : AtomicSizes;
|
||||
defm ATOMIC_LOAD_OR : AtomicSizes;
|
||||
defm ATOMIC_LOAD_XOR : AtomicSizes;
|
||||
defm ATOMIC_LOAD_NAND : AtomicSizes;
|
||||
defm ATOMIC_SWAP : AtomicSizes;
|
||||
let Defs = [CPSR] in {
|
||||
// These operations need a CMP to calculate the correct value
|
||||
defm ATOMIC_LOAD_MIN : AtomicSizes;
|
||||
defm ATOMIC_LOAD_MAX : AtomicSizes;
|
||||
defm ATOMIC_LOAD_UMIN : AtomicSizes;
|
||||
defm ATOMIC_LOAD_UMAX : AtomicSizes;
|
||||
}
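For context (a sketch based on my reading, not text from the patch), these pseudos stand in for source-level read-modify-write operations; since usesCustomInserter is set, each one is presumably expanded later by C++ code into a load-exclusive / store-exclusive retry loop. The kind of operation each family models:

    #include <atomic>

    int  add_one(std::atomic<int> &x)  { return x.fetch_add(1); }   // ATOMIC_LOAD_ADD_I32
    long swap42(std::atomic<long> &x)  { return x.exchange(42); }   // ATOMIC_SWAP_I64
    int  and_mask(std::atomic<int> &x) { return x.fetch_and(0xf); } // ATOMIC_LOAD_AND_I32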
|
||||
|
||||
class AtomicCmpSwap<RegisterClass GPRData>
|
||||
: Pseudo<(outs GPRData:$dst),
|
||||
(ins GPR64sp:$ptr, GPRData:$old, GPRData:$new,
|
||||
i32imm:$ordering), []> {
|
||||
let usesCustomInserter = 1;
|
||||
let hasCtrlDep = 1;
|
||||
let mayLoad = 1;
|
||||
let mayStore = 1;
|
||||
let Defs = [CPSR];
|
||||
}
|
||||
|
||||
def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<GPR32>;
|
||||
def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<GPR32>;
|
||||
def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<GPR32>;
|
||||
def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<GPR64>;
|
||||
|
||||
def ATOMIC_CMP_SWAP_I128
|
||||
: Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
|
||||
(ins GPR64sp:$ptr, GPR64:$oldlo, GPR64:$oldhi,
|
||||
GPR64:$newlo, GPR64:$newhi, i32imm:$ordering), []> {
|
||||
let usesCustomInserter = 1;
|
||||
let hasCtrlDep = 1;
|
||||
let mayLoad = 1;
|
||||
let mayStore = 1;
|
||||
let Defs = [CPSR];
|
||||
}
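Similarly (illustrative assumption), the ATOMIC_CMP_SWAP_* pseudos model compare-and-exchange; a minimal C++ counterpart:

    #include <atomic>

    bool try_update(std::atomic<long> &x, long expected, long desired) {
      // Expected to select ATOMIC_CMP_SWAP_I64 and be expanded by the custom inserter.
      return x.compare_exchange_strong(expected, desired);
    }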
|
||||
|
||||
//===----------------------------------
|
||||
// Low-level exclusive operations
|
||||
//===----------------------------------
|
||||
|
||||
// Load-exclusives.
|
||||
|
||||
def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
|
||||
}]>;
|
||||
|
||||
def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
|
||||
}]>;
|
||||
|
||||
def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
|
||||
}]>;
|
||||
|
||||
def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
|
||||
}]>;
|
||||
|
||||
def : Pat<(ldxr_1 am_noindex:$addr),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
|
||||
def : Pat<(ldxr_2 am_noindex:$addr),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
|
||||
def : Pat<(ldxr_4 am_noindex:$addr),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
|
||||
def : Pat<(ldxr_8 am_noindex:$addr), (LDXRX am_noindex:$addr)>;
|
||||
|
||||
def : Pat<(and (ldxr_1 am_noindex:$addr), 0xff),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
|
||||
def : Pat<(and (ldxr_2 am_noindex:$addr), 0xffff),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
|
||||
def : Pat<(and (ldxr_4 am_noindex:$addr), 0xffffffff),
|
||||
(SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
|
||||
|
||||
// Store-exclusives.
|
||||
|
||||
def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
|
||||
(int_arm64_stxr node:$val, node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
|
||||
}]>;
|
||||
|
||||
def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
|
||||
(int_arm64_stxr node:$val, node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
|
||||
}]>;
|
||||
|
||||
def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
|
||||
(int_arm64_stxr node:$val, node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
|
||||
}]>;
|
||||
|
||||
def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
|
||||
(int_arm64_stxr node:$val, node:$ptr), [{
|
||||
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
|
||||
}]>;
|
||||
|
||||
def : Pat<(stxr_1 GPR64:$val, am_noindex:$addr),
|
||||
(STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
def : Pat<(stxr_2 GPR64:$val, am_noindex:$addr),
|
||||
(STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
def : Pat<(stxr_4 GPR64:$val, am_noindex:$addr),
|
||||
(STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
def : Pat<(stxr_8 GPR64:$val, am_noindex:$addr),
|
||||
(STXRX GPR64:$val, am_noindex:$addr)>;
|
||||
|
||||
def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr),
|
||||
(STXRB GPR32:$val, am_noindex:$addr)>;
|
||||
def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr),
|
||||
(STXRH GPR32:$val, am_noindex:$addr)>;
|
||||
def : Pat<(stxr_4 (zext GPR32:$val), am_noindex:$addr),
|
||||
(STXRW GPR32:$val, am_noindex:$addr)>;
|
||||
|
||||
def : Pat<(stxr_1 (and GPR64:$val, 0xff), am_noindex:$addr),
|
||||
(STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
def : Pat<(stxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr),
|
||||
(STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr),
|
||||
(STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
|
||||
|
||||
|
||||
// And clear exclusive.
|
||||
|
||||
def : Pat<(int_arm64_clrex), (CLREX 0xf)>;
|
8199	lib/Target/ARM64/ARM64InstrFormats.td	Normal file	(file diff suppressed because it is too large)
1864	lib/Target/ARM64/ARM64InstrInfo.cpp	Normal file	(file diff suppressed because it is too large)
223	lib/Target/ARM64/ARM64InstrInfo.h	Normal file
@ -0,0 +1,223 @@
|
||||
//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the ARM64 implementation of the TargetInstrInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_ARM64INSTRINFO_H
|
||||
#define LLVM_TARGET_ARM64INSTRINFO_H
|
||||
|
||||
#include "ARM64.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
|
||||
#define GET_INSTRINFO_HEADER
|
||||
#include "ARM64GenInstrInfo.inc"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class ARM64Subtarget;
|
||||
class ARM64TargetMachine;
|
||||
|
||||
class ARM64InstrInfo : public ARM64GenInstrInfo {
|
||||
// Reserve bits in the MachineMemOperand target hint flags, starting at 1.
|
||||
// They will be shifted into MOTargetHintStart when accessed.
|
||||
enum TargetMemOperandFlags {
|
||||
MOSuppressPair = 1
|
||||
};
|
||||
|
||||
const ARM64RegisterInfo RI;
|
||||
const ARM64Subtarget &Subtarget;
|
||||
|
||||
public:
|
||||
explicit ARM64InstrInfo(const ARM64Subtarget &STI);
|
||||
|
||||
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
|
||||
/// such, whenever a client has an instance of instruction info, it should
|
||||
/// always be able to get register info as well (through this method).
|
||||
virtual const ARM64RegisterInfo &getRegisterInfo() const { return RI; }
|
||||
|
||||
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
|
||||
|
||||
virtual bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
|
||||
unsigned &DstReg, unsigned &SubIdx) const;
|
||||
|
||||
virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
|
||||
int &FrameIndex) const;
|
||||
virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
|
||||
int &FrameIndex) const;
|
||||
|
||||
/// \brief Does this instruction set its full destination register to zero?
|
||||
bool isGPRZero(const MachineInstr *MI) const;
|
||||
|
||||
/// \brief Does this instruction rename a GPR without modifying bits?
|
||||
bool isGPRCopy(const MachineInstr *MI) const;
|
||||
|
||||
/// \brief Does this instruction rename an FPR without modifying bits?
|
||||
bool isFPRCopy(const MachineInstr *MI) const;
|
||||
|
||||
/// Return true if this load/store scales or extends its register offset.
|
||||
/// This refers to scaling a dynamic index as opposed to scaled immediates.
|
||||
/// MI should be a memory op that allows scaled addressing.
|
||||
bool isScaledAddr(const MachineInstr *MI) const;
|
||||
|
||||
/// Return true if pairing the given load or store is hinted to be
|
||||
/// unprofitable.
|
||||
bool isLdStPairSuppressed(const MachineInstr *MI) const;
|
||||
|
||||
/// Hint that pairing the given load or store is unprofitable.
|
||||
void suppressLdStPair(MachineInstr *MI) const;
|
||||
|
||||
virtual bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
|
||||
unsigned &Offset,
|
||||
const TargetRegisterInfo *TRI) const;
|
||||
|
||||
virtual bool enableClusterLoads() const { return true; }
|
||||
|
||||
virtual bool shouldClusterLoads(MachineInstr *FirstLdSt,
|
||||
MachineInstr *SecondLdSt,
|
||||
unsigned NumLoads) const;
|
||||
|
||||
virtual bool shouldScheduleAdjacent(MachineInstr *First,
|
||||
MachineInstr *Second) const;
|
||||
|
||||
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
|
||||
uint64_t Offset, const MDNode *MDPtr,
|
||||
DebugLoc DL) const;
|
||||
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
||||
DebugLoc DL, unsigned DestReg, unsigned SrcReg,
|
||||
bool KillSrc, unsigned Opcode,
|
||||
llvm::ArrayRef<unsigned> Indices) const;
|
||||
virtual void copyPhysReg(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I, DebugLoc DL,
|
||||
unsigned DestReg, unsigned SrcReg,
|
||||
bool KillSrc) const;
|
||||
|
||||
virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
unsigned SrcReg, bool isKill, int FrameIndex,
|
||||
const TargetRegisterClass *RC,
|
||||
const TargetRegisterInfo *TRI) const;
|
||||
|
||||
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
unsigned DestReg, int FrameIndex,
|
||||
const TargetRegisterClass *RC,
|
||||
const TargetRegisterInfo *TRI) const;
|
||||
|
||||
virtual MachineInstr *
|
||||
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
|
||||
const SmallVectorImpl<unsigned> &Ops,
|
||||
int FrameIndex) const;
|
||||
|
||||
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
|
||||
MachineBasicBlock *&FBB,
|
||||
SmallVectorImpl<MachineOperand> &Cond,
|
||||
bool AllowModify = false) const;
|
||||
virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
|
||||
virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
|
||||
MachineBasicBlock *FBB,
|
||||
const SmallVectorImpl<MachineOperand> &Cond,
|
||||
DebugLoc DL) const;
|
||||
virtual bool
|
||||
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
|
||||
virtual bool canInsertSelect(const MachineBasicBlock &,
|
||||
const SmallVectorImpl<MachineOperand> &Cond,
|
||||
unsigned, unsigned, int &, int &, int &) const;
|
||||
virtual void insertSelect(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MI, DebugLoc DL,
|
||||
unsigned DstReg,
|
||||
const SmallVectorImpl<MachineOperand> &Cond,
|
||||
unsigned TrueReg, unsigned FalseReg) const;
|
||||
virtual void getNoopForMachoTarget(MCInst &NopInst) const;
|
||||
|
||||
/// analyzeCompare - For a comparison instruction, return the source registers
|
||||
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
|
||||
/// Return true if the comparison instruction can be analyzed.
|
||||
virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
|
||||
unsigned &SrcReg2, int &CmpMask,
|
||||
int &CmpValue) const;
|
||||
/// optimizeCompareInstr - Convert the instruction supplying the argument to
|
||||
/// the comparison into one that sets the zero bit in the flags register.
|
||||
virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
|
||||
unsigned SrcReg2, int CmpMask, int CmpValue,
|
||||
const MachineRegisterInfo *MRI) const;
|
||||
|
||||
private:
|
||||
void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
|
||||
MachineBasicBlock *TBB,
|
||||
const SmallVectorImpl<MachineOperand> &Cond) const;
|
||||
};
|
||||
|
||||
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
|
||||
/// plus Offset. This is intended to be used from within the prolog/epilog
|
||||
/// insertion (PEI) pass, where a virtual scratch register may be allocated
|
||||
/// if necessary, to be replaced by the scavenger at the end of PEI.
|
||||
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
||||
DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
|
||||
const ARM64InstrInfo *TII,
|
||||
MachineInstr::MIFlag = MachineInstr::NoFlags,
|
||||
bool SetCPSR = false);
|
||||
|
||||
/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the
|
||||
/// FP. Return false if the offset could not be handled directly in MI, and
|
||||
/// return the left-over portion by reference.
|
||||
bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
|
||||
unsigned FrameReg, int &Offset,
|
||||
const ARM64InstrInfo *TII);
|
||||
|
||||
/// \brief Used to report the frame offset status from isARM64FrameOffsetLegal.
|
||||
enum ARM64FrameOffsetStatus {
|
||||
ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
|
||||
ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
|
||||
ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
|
||||
};
|
||||
|
||||
/// \brief Check if the @p Offset is a valid frame offset for @p MI.
|
||||
/// The returned value reports the validity of the frame offset for @p MI.
|
||||
/// It uses the values defined by ARM64FrameOffsetStatus for that.
|
||||
/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to
|
||||
/// use an offset.
|
||||
/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be
|
||||
/// rewritten in @p MI.
|
||||
/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the
|
||||
/// amount that is off the limit of the legal offset.
|
||||
/// If set, @p OutUseUnscaledOp will contain whether @p MI should be
|
||||
/// turned into an unscaled operation, whose opcode is in @p OutUnscaledOp.
|
||||
/// If set, @p EmittableOffset contains the amount that can be set in @p MI
|
||||
/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
|
||||
/// is a legal offset.
|
||||
int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
|
||||
bool *OutUseUnscaledOp = NULL,
|
||||
unsigned *OutUnscaledOp = NULL,
|
||||
int *EmittableOffset = NULL);
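A minimal caller sketch (my assumption about intended usage, not code from this patch) showing how the bit-flag result might be interpreted; only the ARM64FrameOffset* constants come from the enum above, everything else is illustrative:

    // Classify the result of isARM64FrameOffsetLegal into an action.
    enum FrameOffsetAction { UseScratchReg, RewriteInPlace, SplitOffset };

    static FrameOffsetAction classifyFrameOffset(int Status) {
      if (Status == ARM64FrameOffsetCannotUpdate)
        return UseScratchReg;    // MI cannot take an immediate offset at all.
      if (Status & ARM64FrameOffsetIsLegal)
        return RewriteInPlace;   // The whole offset fits directly in MI.
      return SplitOffset;        // Only part fits; the remainder stays in Offset.
    }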
|
||||
|
||||
static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; }
|
||||
|
||||
static inline bool isCondBranchOpcode(int Opc) {
|
||||
switch (Opc) {
|
||||
case ARM64::Bcc:
|
||||
case ARM64::CBZW:
|
||||
case ARM64::CBZX:
|
||||
case ARM64::CBNZW:
|
||||
case ARM64::CBNZX:
|
||||
case ARM64::TBZ:
|
||||
case ARM64::TBNZ:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; }
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
4394	lib/Target/ARM64/ARM64InstrInfo.td	Normal file	(file diff suppressed because it is too large)
950	lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp	Normal file
@ -0,0 +1,950 @@
|
||||
//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains a pass that performs load / store related peephole
|
||||
// optimizations. This pass should be run after register allocation.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-ldst-opt"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "MCTargetDesc/ARM64AddressingModes.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
using namespace llvm;
|
||||
|
||||
/// ARM64LoadStoreOpt - Post-register allocation pass to combine
|
||||
/// load / store instructions to form ldp / stp instructions.
|
||||
|
||||
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
|
||||
STATISTIC(NumPostFolded, "Number of post-index updates folded");
|
||||
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
|
||||
STATISTIC(NumUnscaledPairCreated,
|
||||
"Number of load/store from unscaled generated");
|
||||
|
||||
static cl::opt<bool> DoLoadStoreOpt("arm64-load-store-opt", cl::init(true),
|
||||
cl::Hidden);
|
||||
static cl::opt<unsigned> ScanLimit("arm64-load-store-scan-limit", cl::init(20),
|
||||
cl::Hidden);
|
||||
|
||||
// Placeholder while testing unscaled load/store combining
|
||||
static cl::opt<bool>
|
||||
EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden,
|
||||
cl::desc("Allow ARM64 unscaled load/store combining"),
|
||||
cl::init(true));
|
||||
|
||||
namespace {
|
||||
struct ARM64LoadStoreOpt : public MachineFunctionPass {
|
||||
static char ID;
|
||||
ARM64LoadStoreOpt() : MachineFunctionPass(ID) {}
|
||||
|
||||
const ARM64InstrInfo *TII;
|
||||
const TargetRegisterInfo *TRI;
|
||||
|
||||
// Scan the instructions looking for a load/store that can be combined
|
||||
// with the current instruction into a load/store pair.
|
||||
// Return the matching instruction if one is found, else MBB->end().
|
||||
// If a matching instruction is found, mergeForward is set to true if the
|
||||
// merge is to remove the first instruction and replace the second with
|
||||
// a pair-wise insn, and false if the reverse is true.
|
||||
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
|
||||
bool &mergeForward,
|
||||
unsigned Limit);
|
||||
// Merge the two instructions indicated into a single pair-wise instruction.
|
||||
// If mergeForward is true, erase the first instruction and fold its
|
||||
// operation into the second. If false, the reverse. Return the instruction
|
||||
// following the first instruction (which may change during processing).
|
||||
MachineBasicBlock::iterator
|
||||
mergePairedInsns(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired, bool mergeForward);
|
||||
|
||||
// Scan the instruction list to find a base register update that can
|
||||
// be combined with the current instruction (a load or store) using
|
||||
// pre or post indexed addressing with writeback. Scan forwards.
|
||||
MachineBasicBlock::iterator
|
||||
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
|
||||
int Value);
|
||||
|
||||
// Scan the instruction list to find a base register update that can
|
||||
// be combined with the current instruction (a load or store) using
|
||||
// pre or post indexed addressing with writeback. Scan backwards.
|
||||
MachineBasicBlock::iterator
|
||||
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
|
||||
|
||||
// Merge a pre-index base register update into a ld/st instruction.
|
||||
MachineBasicBlock::iterator
|
||||
mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Update);
|
||||
|
||||
// Merge a post-index base register update into a ld/st instruction.
|
||||
MachineBasicBlock::iterator
|
||||
mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Update);
|
||||
|
||||
bool optimizeBlock(MachineBasicBlock &MBB);
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &Fn);
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "ARM64 load / store optimization pass";
|
||||
}
|
||||
|
||||
private:
|
||||
int getMemSize(MachineInstr *MemMI);
|
||||
};
|
||||
char ARM64LoadStoreOpt::ID = 0;
|
||||
}
|
||||
|
||||
static bool isUnscaledLdst(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
return false;
|
||||
case ARM64::STURSi:
|
||||
return true;
|
||||
case ARM64::STURDi:
|
||||
return true;
|
||||
case ARM64::STURQi:
|
||||
return true;
|
||||
case ARM64::STURWi:
|
||||
return true;
|
||||
case ARM64::STURXi:
|
||||
return true;
|
||||
case ARM64::LDURSi:
|
||||
return true;
|
||||
case ARM64::LDURDi:
|
||||
return true;
|
||||
case ARM64::LDURQi:
|
||||
return true;
|
||||
case ARM64::LDURWi:
|
||||
return true;
|
||||
case ARM64::LDURXi:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Size in bytes of the data moved by a scaled or unscaled load or store.
|
||||
int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
|
||||
switch (MemMI->getOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("Opcode has has unknown size!");
|
||||
case ARM64::STRSui:
|
||||
case ARM64::STURSi:
|
||||
return 4;
|
||||
case ARM64::STRDui:
|
||||
case ARM64::STURDi:
|
||||
return 8;
|
||||
case ARM64::STRQui:
|
||||
case ARM64::STURQi:
|
||||
return 16;
|
||||
case ARM64::STRWui:
|
||||
case ARM64::STURWi:
|
||||
return 4;
|
||||
case ARM64::STRXui:
|
||||
case ARM64::STURXi:
|
||||
return 8;
|
||||
case ARM64::LDRSui:
|
||||
case ARM64::LDURSi:
|
||||
return 4;
|
||||
case ARM64::LDRDui:
|
||||
case ARM64::LDURDi:
|
||||
return 8;
|
||||
case ARM64::LDRQui:
|
||||
case ARM64::LDURQi:
|
||||
return 16;
|
||||
case ARM64::LDRWui:
|
||||
case ARM64::LDURWi:
|
||||
return 4;
|
||||
case ARM64::LDRXui:
|
||||
case ARM64::LDURXi:
|
||||
return 8;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getMatchingPairOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
llvm_unreachable("Opcode has no pairwise equivalent!");
|
||||
case ARM64::STRSui:
|
||||
case ARM64::STURSi:
|
||||
return ARM64::STPSi;
|
||||
case ARM64::STRDui:
|
||||
case ARM64::STURDi:
|
||||
return ARM64::STPDi;
|
||||
case ARM64::STRQui:
|
||||
case ARM64::STURQi:
|
||||
return ARM64::STPQi;
|
||||
case ARM64::STRWui:
|
||||
case ARM64::STURWi:
|
||||
return ARM64::STPWi;
|
||||
case ARM64::STRXui:
|
||||
case ARM64::STURXi:
|
||||
return ARM64::STPXi;
|
||||
case ARM64::LDRSui:
|
||||
case ARM64::LDURSi:
|
||||
return ARM64::LDPSi;
|
||||
case ARM64::LDRDui:
|
||||
case ARM64::LDURDi:
|
||||
return ARM64::LDPDi;
|
||||
case ARM64::LDRQui:
|
||||
case ARM64::LDURQi:
|
||||
return ARM64::LDPQi;
|
||||
case ARM64::LDRWui:
|
||||
case ARM64::LDURWi:
|
||||
return ARM64::LDPWi;
|
||||
case ARM64::LDRXui:
|
||||
case ARM64::LDURXi:
|
||||
return ARM64::LDPXi;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getPreIndexedOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
llvm_unreachable("Opcode has no pre-indexed equivalent!");
|
||||
case ARM64::STRSui: return ARM64::STRSpre;
|
||||
case ARM64::STRDui: return ARM64::STRDpre;
|
||||
case ARM64::STRQui: return ARM64::STRQpre;
|
||||
case ARM64::STRWui: return ARM64::STRWpre;
|
||||
case ARM64::STRXui: return ARM64::STRXpre;
|
||||
case ARM64::LDRSui: return ARM64::LDRSpre;
|
||||
case ARM64::LDRDui: return ARM64::LDRDpre;
|
||||
case ARM64::LDRQui: return ARM64::LDRQpre;
|
||||
case ARM64::LDRWui: return ARM64::LDRWpre;
|
||||
case ARM64::LDRXui: return ARM64::LDRXpre;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getPostIndexedOpcode(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
|
||||
case ARM64::STRSui:
|
||||
return ARM64::STRSpost;
|
||||
case ARM64::STRDui:
|
||||
return ARM64::STRDpost;
|
||||
case ARM64::STRQui:
|
||||
return ARM64::STRQpost;
|
||||
case ARM64::STRWui:
|
||||
return ARM64::STRWpost;
|
||||
case ARM64::STRXui:
|
||||
return ARM64::STRXpost;
|
||||
case ARM64::LDRSui:
|
||||
return ARM64::LDRSpost;
|
||||
case ARM64::LDRDui:
|
||||
return ARM64::LDRDpost;
|
||||
case ARM64::LDRQui:
|
||||
return ARM64::LDRQpost;
|
||||
case ARM64::LDRWui:
|
||||
return ARM64::LDRWpost;
|
||||
case ARM64::LDRXui:
|
||||
return ARM64::LDRXpost;
|
||||
}
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired,
|
||||
bool mergeForward) {
|
||||
MachineBasicBlock::iterator NextI = I;
|
||||
++NextI;
|
||||
// If NextI is the second of the two instructions to be merged, we need
|
||||
// to skip one further. Either way we merge will invalidate the iterator,
|
||||
// and we don't need to scan the new instruction, as it's a pairwise
|
||||
// instruction, which we're not considering for further action anyway.
|
||||
if (NextI == Paired)
|
||||
++NextI;
|
||||
|
||||
bool IsUnscaled = isUnscaledLdst(I->getOpcode());
|
||||
int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1;
|
||||
|
||||
unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
|
||||
// Insert our new paired instruction after whichever of the paired
|
||||
// instructions mergeForward indicates.
|
||||
MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I;
|
||||
// mergeForward also determines which instruction the base register operand is
|
||||
// copied from, so that the flags stay compatible with the input code.
|
||||
MachineOperand &BaseRegOp =
|
||||
mergeForward ? Paired->getOperand(1) : I->getOperand(1);
|
||||
|
||||
// Which register is Rt and which is Rt2 depends on the offset order.
|
||||
MachineInstr *RtMI, *Rt2MI;
|
||||
if (I->getOperand(2).getImm() ==
|
||||
Paired->getOperand(2).getImm() + OffsetStride) {
|
||||
RtMI = Paired;
|
||||
Rt2MI = I;
|
||||
} else {
|
||||
RtMI = I;
|
||||
Rt2MI = Paired;
|
||||
}
|
||||
// Handle Unscaled
|
||||
int OffsetImm = RtMI->getOperand(2).getImm();
|
||||
if (IsUnscaled && EnableARM64UnscaledMemOp)
|
||||
OffsetImm /= OffsetStride;
|
||||
|
||||
// Construct the new instruction.
|
||||
MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
|
||||
I->getDebugLoc(), TII->get(NewOpc))
|
||||
.addOperand(RtMI->getOperand(0))
|
||||
.addOperand(Rt2MI->getOperand(0))
|
||||
.addOperand(BaseRegOp)
|
||||
.addImm(OffsetImm);
|
||||
(void)MIB;
|
||||
|
||||
// FIXME: Do we need/want to copy the mem operands from the source
|
||||
// instructions? Probably. What uses them after this?
|
||||
|
||||
DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
|
||||
DEBUG(I->print(dbgs()));
|
||||
DEBUG(dbgs() << " ");
|
||||
DEBUG(Paired->print(dbgs()));
|
||||
DEBUG(dbgs() << " with instruction:\n ");
|
||||
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
|
||||
DEBUG(dbgs() << "\n");
|
||||
|
||||
// Erase the old instructions.
|
||||
I->eraseFromParent();
|
||||
Paired->eraseFromParent();
|
||||
|
||||
return NextI;
|
||||
}
|
||||
|
||||
/// trackRegDefsUses - Remember what registers the specified instruction uses
|
||||
/// and modifies.
|
||||
static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
|
||||
BitVector &UsedRegs,
|
||||
const TargetRegisterInfo *TRI) {
|
||||
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
||||
MachineOperand &MO = MI->getOperand(i);
|
||||
if (MO.isRegMask())
|
||||
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
|
||||
|
||||
if (!MO.isReg())
|
||||
continue;
|
||||
unsigned Reg = MO.getReg();
|
||||
if (MO.isDef()) {
|
||||
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
|
||||
ModifiedRegs.set(*AI);
|
||||
} else {
|
||||
assert(MO.isUse() && "Reg operand not a def and not a use?!?");
|
||||
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
|
||||
UsedRegs.set(*AI);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
|
||||
if (!IsUnscaled && (Offset > 63 || Offset < -64))
|
||||
return false;
|
||||
if (IsUnscaled) {
|
||||
// Convert the byte-offset used by unscaled into an "element" offset used
|
||||
// by the scaled pair load/store instructions.
|
||||
int elemOffset = Offset / OffsetStride;
|
||||
if (elemOffset > 63 || elemOffset < -64)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Do alignment, specialized to power of 2 and for signed ints,
|
||||
// avoiding having to do a C-style cast from uint64_t to int when
|
||||
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
|
||||
// FIXME: Move this function to include/MathExtras.h?
|
||||
static int alignTo(int Num, int PowOf2) {
|
||||
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
|
||||
}
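A quick worked example of the rounding behaviour (illustrative only; not part of the pass):

    #include <cassert>

    static void alignToExamples() {
      assert(alignTo(0, 8)  == 0);
      assert(alignTo(1, 8)  == 8);
      assert(alignTo(13, 8) == 16);
      assert(alignTo(16, 8) == 16);  // Already-aligned values are unchanged.
    }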
|
||||
|
||||
/// findMatchingInsn - Scan the instructions looking for a load/store that can
|
||||
/// be combined with the current instruction into a load/store pair.
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
|
||||
bool &mergeForward, unsigned Limit) {
|
||||
MachineBasicBlock::iterator E = I->getParent()->end();
|
||||
MachineBasicBlock::iterator MBBI = I;
|
||||
MachineInstr *FirstMI = I;
|
||||
++MBBI;
|
||||
|
||||
int Opc = FirstMI->getOpcode();
|
||||
bool mayLoad = FirstMI->mayLoad();
|
||||
bool IsUnscaled = isUnscaledLdst(Opc);
|
||||
unsigned Reg = FirstMI->getOperand(0).getReg();
|
||||
unsigned BaseReg = FirstMI->getOperand(1).getReg();
|
||||
int Offset = FirstMI->getOperand(2).getImm();
|
||||
|
||||
// Early exit if the first instruction modifies the base register.
|
||||
// e.g., ldr x0, [x0]
|
||||
// Early exit if the offset is not possible to match. (6 bits of positive
|
||||
// range, plus allow an extra one in case we find a later insn that matches
|
||||
// with Offset-1.)
|
||||
if (FirstMI->modifiesRegister(BaseReg, TRI))
|
||||
return E;
|
||||
int OffsetStride =
|
||||
IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(FirstMI) : 1;
|
||||
if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
|
||||
return E;
|
||||
|
||||
// Track which registers have been modified and used between the first insn
|
||||
// (inclusive) and the second insn.
|
||||
BitVector ModifiedRegs, UsedRegs;
|
||||
ModifiedRegs.resize(TRI->getNumRegs());
|
||||
UsedRegs.resize(TRI->getNumRegs());
|
||||
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
|
||||
MachineInstr *MI = MBBI;
|
||||
// Skip DBG_VALUE instructions. Otherwise debug info can affect the
|
||||
// optimization by changing how far we scan.
|
||||
if (MI->isDebugValue())
|
||||
continue;
|
||||
|
||||
// Now that we know this is a real instruction, count it.
|
||||
++Count;
|
||||
|
||||
if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
|
||||
// If we've found another instruction with the same opcode, check to see
|
||||
// if the base and offset are compatible with our starting instruction.
|
||||
// These instructions all have scaled immediate operands, so we just
|
||||
// check for +1/-1. Make sure to check the new instruction offset is
|
||||
// actually an immediate and not a symbolic reference destined for
|
||||
// a relocation.
|
||||
//
|
||||
// Pairwise instructions have a 7-bit signed offset field. Single insns
|
||||
// have a 12-bit unsigned offset field. To be a valid combine, the
|
||||
// final offset must be in range.
|
||||
unsigned MIBaseReg = MI->getOperand(1).getReg();
|
||||
int MIOffset = MI->getOperand(2).getImm();
|
||||
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
|
||||
(Offset + OffsetStride == MIOffset))) {
|
||||
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
|
||||
// If this is a volatile load/store that otherwise matched, stop looking
|
||||
// as something is going on that we don't have enough information to
|
||||
// safely transform. Similarly, stop if we see a hint to avoid pairs.
|
||||
if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
|
||||
return E;
|
||||
// If the resultant immediate offset of merging these instructions
|
||||
// is out of range for a pairwise instruction, bail and keep looking.
|
||||
bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
|
||||
if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
continue;
|
||||
}
|
||||
// If the alignment requirements of the paired (scaled) instruction
|
||||
// can't express the offset of the unscaled input, bail and keep
|
||||
// looking.
|
||||
if (IsUnscaled && EnableARM64UnscaledMemOp &&
|
||||
(alignTo(MinOffset, OffsetStride) != MinOffset)) {
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
continue;
|
||||
}
|
||||
// If the destination register of the loads is the same register, bail
|
||||
// and keep looking. A load-pair instruction with both destination
|
||||
// registers the same is UNPREDICTABLE and will result in an exception.
|
||||
if (mayLoad && Reg == MI->getOperand(0).getReg()) {
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the Rt of the second instruction was not modified or used between
|
||||
// the two instructions, we can combine the second into the first.
|
||||
if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
|
||||
!UsedRegs[MI->getOperand(0).getReg()]) {
|
||||
mergeForward = false;
|
||||
return MBBI;
|
||||
}
|
||||
|
||||
// Likewise, if the Rt of the first instruction is not modified or used
|
||||
// between the two instructions, we can combine the first into the
|
||||
// second.
|
||||
if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
|
||||
!UsedRegs[FirstMI->getOperand(0).getReg()]) {
|
||||
mergeForward = true;
|
||||
return MBBI;
|
||||
}
|
||||
// Unable to combine these instructions due to interference in between.
|
||||
// Keep looking.
|
||||
}
|
||||
}
|
||||
|
||||
// If the instruction wasn't a matching load or store, but does (or can)
|
||||
// modify memory, stop searching, as we don't have alias analysis or
|
||||
// anything like that to tell us whether the access is tromping on the
|
||||
// locations we care about. The big one we want to catch is calls.
|
||||
//
|
||||
// FIXME: Theoretically, we can do better than that for SP and FP based
|
||||
// references since we can effectively know where those are touching. It's
|
||||
// unclear if it's worth the extra code, though. Most paired instructions
|
||||
// will be sequential, perhaps with a few intervening non-memory related
|
||||
// instructions.
|
||||
if (MI->mayStore() || MI->isCall())
|
||||
return E;
|
||||
// Likewise, if we're matching a store instruction, we don't want to
|
||||
// move across a load, as it may be reading the same location.
|
||||
if (FirstMI->mayStore() && MI->mayLoad())
|
||||
return E;
|
||||
|
||||
// Update modified / used register lists.
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
|
||||
// Otherwise, if the base register is modified, we have no match, so
|
||||
// return early.
|
||||
if (ModifiedRegs[BaseReg])
|
||||
return E;
|
||||
}
|
||||
return E;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Update) {
|
||||
assert((Update->getOpcode() == ARM64::ADDXri ||
|
||||
Update->getOpcode() == ARM64::SUBXri) &&
|
||||
"Unexpected base register update instruction to merge!");
|
||||
MachineBasicBlock::iterator NextI = I;
|
||||
// Return the instruction following the merged instruction, which is
|
||||
// the instruction following our unmerged load. Unless that's the add/sub
|
||||
// instruction we're merging, in which case it's the one after that.
|
||||
if (++NextI == Update)
|
||||
++NextI;
|
||||
|
||||
int Value = Update->getOperand(2).getImm();
|
||||
assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
|
||||
"Can't merge 1 << 12 offset into pre-indexed load / store");
|
||||
if (Update->getOpcode() == ARM64::SUBXri)
|
||||
Value = -Value;
|
||||
|
||||
unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
|
||||
.addOperand(I->getOperand(0))
|
||||
.addOperand(I->getOperand(1))
|
||||
.addImm(Value);
|
||||
(void)MIB;
|
||||
|
||||
DEBUG(dbgs() << "Creating pre-indexed load/store.");
|
||||
DEBUG(dbgs() << " Replacing instructions:\n ");
|
||||
DEBUG(I->print(dbgs()));
|
||||
DEBUG(dbgs() << " ");
|
||||
DEBUG(Update->print(dbgs()));
|
||||
DEBUG(dbgs() << " with instruction:\n ");
|
||||
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
|
||||
DEBUG(dbgs() << "\n");
|
||||
|
||||
// Erase the old instructions for the block.
|
||||
I->eraseFromParent();
|
||||
Update->eraseFromParent();
|
||||
|
||||
return NextI;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Update) {
|
||||
assert((Update->getOpcode() == ARM64::ADDXri ||
|
||||
Update->getOpcode() == ARM64::SUBXri) &&
|
||||
"Unexpected base register update instruction to merge!");
|
||||
MachineBasicBlock::iterator NextI = I;
|
||||
// Return the instruction following the merged instruction, which is
|
||||
// the instruction following our unmerged load. Unless that's the add/sub
|
||||
// instruction we're merging, in which case it's the one after that.
|
||||
if (++NextI == Update)
|
||||
++NextI;
|
||||
|
||||
int Value = Update->getOperand(2).getImm();
|
||||
assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
|
||||
"Can't merge 1 << 12 offset into post-indexed load / store");
|
||||
if (Update->getOpcode() == ARM64::SUBXri)
|
||||
Value = -Value;
|
||||
|
||||
unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
|
||||
.addOperand(I->getOperand(0))
|
||||
.addOperand(I->getOperand(1))
|
||||
.addImm(Value);
|
||||
(void)MIB;
|
||||
|
||||
DEBUG(dbgs() << "Creating post-indexed load/store.");
|
||||
DEBUG(dbgs() << " Replacing instructions:\n ");
|
||||
DEBUG(I->print(dbgs()));
|
||||
DEBUG(dbgs() << " ");
|
||||
DEBUG(Update->print(dbgs()));
|
||||
DEBUG(dbgs() << " with instruction:\n ");
|
||||
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
|
||||
DEBUG(dbgs() << "\n");
|
||||
|
||||
// Erase the old instructions for the block.
|
||||
I->eraseFromParent();
|
||||
Update->eraseFromParent();
|
||||
|
||||
return NextI;
|
||||
}
|
||||
|
||||
static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
|
||||
int Offset) {
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
break;
|
||||
case ARM64::SUBXri:
|
||||
// Negate the offset for a SUB instruction.
|
||||
Offset *= -1;
|
||||
// FALLTHROUGH
|
||||
case ARM64::ADDXri:
|
||||
// Make sure it's a vanilla immediate operand, not a relocation or
|
||||
// anything else we can't handle.
|
||||
if (!MI->getOperand(2).isImm())
|
||||
break;
|
||||
// Watch out for 1 << 12 shifted value.
|
||||
if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm()))
|
||||
break;
|
||||
// If the instruction has the base register as source and dest and the
|
||||
// immediate will fit in a signed 9-bit integer, then we have a match.
|
||||
if (MI->getOperand(0).getReg() == BaseReg &&
|
||||
MI->getOperand(1).getReg() == BaseReg &&
|
||||
MI->getOperand(2).getImm() <= 255 &&
|
||||
MI->getOperand(2).getImm() >= -256) {
|
||||
// If we have a non-zero Offset, we check that it matches the amount
|
||||
// we're adding to the register.
|
||||
if (!Offset || Offset == MI->getOperand(2).getImm())
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
|
||||
unsigned Limit, int Value) {
|
||||
MachineBasicBlock::iterator E = I->getParent()->end();
|
||||
MachineInstr *MemMI = I;
|
||||
MachineBasicBlock::iterator MBBI = I;
|
||||
const MachineFunction &MF = *MemMI->getParent()->getParent();
|
||||
|
||||
unsigned DestReg = MemMI->getOperand(0).getReg();
|
||||
unsigned BaseReg = MemMI->getOperand(1).getReg();
|
||||
int Offset = MemMI->getOperand(2).getImm() *
|
||||
TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
|
||||
|
||||
// If the base register overlaps the destination register, we can't
|
||||
// merge the update.
|
||||
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
|
||||
return E;
|
||||
|
||||
// Scan forward looking for post-index opportunities.
|
||||
// Updating instructions can't be formed if the memory insn already
|
||||
// has an offset other than the value we're looking for.
|
||||
if (Offset != Value)
|
||||
return E;
|
||||
|
||||
// Track which registers have been modified and used between the first insn
|
||||
// (inclusive) and the second insn.
|
||||
BitVector ModifiedRegs, UsedRegs;
|
||||
ModifiedRegs.resize(TRI->getNumRegs());
|
||||
UsedRegs.resize(TRI->getNumRegs());
|
||||
++MBBI;
|
||||
for (unsigned Count = 0; MBBI != E; ++MBBI) {
|
||||
MachineInstr *MI = MBBI;
|
||||
// Skip DBG_VALUE instructions. Otherwise debug info can affect the
|
||||
// optimization by changing how far we scan.
|
||||
if (MI->isDebugValue())
|
||||
continue;
|
||||
|
||||
// Now that we know this is a real instruction, count it.
|
||||
++Count;
|
||||
|
||||
// If we found a match, return it.
|
||||
if (isMatchingUpdateInsn(MI, BaseReg, Value))
|
||||
return MBBI;
|
||||
|
||||
// Update the status of what the instruction clobbered and used.
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
|
||||
// Otherwise, if the base register is used or modified, we have no match, so
|
||||
// return early.
|
||||
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
|
||||
return E;
|
||||
}
|
||||
return E;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,
|
||||
unsigned Limit) {
|
||||
MachineBasicBlock::iterator B = I->getParent()->begin();
|
||||
MachineBasicBlock::iterator E = I->getParent()->end();
|
||||
MachineInstr *MemMI = I;
|
||||
MachineBasicBlock::iterator MBBI = I;
|
||||
const MachineFunction &MF = *MemMI->getParent()->getParent();
|
||||
|
||||
unsigned DestReg = MemMI->getOperand(0).getReg();
|
||||
unsigned BaseReg = MemMI->getOperand(1).getReg();
|
||||
int Offset = MemMI->getOperand(2).getImm();
|
||||
unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
|
||||
|
||||
// If the load/store is the first instruction in the block, there's obviously
|
||||
// not any matching update. Ditto if the memory offset isn't zero.
|
||||
if (MBBI == B || Offset != 0)
|
||||
return E;
|
||||
// If the base register overlaps the destination register, we can't
|
||||
// merge the update.
|
||||
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
|
||||
return E;
|
||||
|
||||
// Track which registers have been modified and used between the first insn
|
||||
// (inclusive) and the second insn.
|
||||
BitVector ModifiedRegs, UsedRegs;
|
||||
ModifiedRegs.resize(TRI->getNumRegs());
|
||||
UsedRegs.resize(TRI->getNumRegs());
|
||||
--MBBI;
|
||||
for (unsigned Count = 0; MBBI != B; --MBBI) {
|
||||
MachineInstr *MI = MBBI;
|
||||
// Skip DBG_VALUE instructions. Otherwise debug info can affect the
|
||||
// optimization by changing how far we scan.
|
||||
if (MI->isDebugValue())
|
||||
continue;
|
||||
|
||||
// Now that we know this is a real instruction, count it.
|
||||
++Count;
|
||||
|
||||
// If we found a match, return it.
|
||||
if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
|
||||
return MBBI;
|
||||
|
||||
// Update the status of what the instruction clobbered and used.
|
||||
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
|
||||
|
||||
// Otherwise, if the base register is used or modified, we have no match, so
|
||||
// return early.
|
||||
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
|
||||
return E;
|
||||
}
|
||||
return E;
|
||||
}
|
||||
|
||||
bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
|
||||
bool Modified = false;
|
||||
// Two transformations to do here:
|
||||
// 1) Find loads and stores that can be merged into a single load or store
|
||||
// pair instruction.
|
||||
// e.g.,
|
||||
// ldr x0, [x2]
|
||||
// ldr x1, [x2, #8]
|
||||
// ; becomes
|
||||
// ldp x0, x1, [x2]
|
||||
// 2) Find base register updates that can be merged into the load or store
|
||||
// as a base-reg writeback.
|
||||
// e.g.,
|
||||
// ldr x0, [x2]
|
||||
// add x2, x2, #4
|
||||
// ; becomes
|
||||
// ldr x0, [x2], #4
|
||||
|
||||
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
|
||||
MBBI != E;) {
|
||||
MachineInstr *MI = MBBI;
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
// Just move on to the next instruction.
|
||||
++MBBI;
|
||||
break;
|
||||
case ARM64::STRSui:
|
||||
case ARM64::STRDui:
|
||||
case ARM64::STRQui:
|
||||
case ARM64::STRXui:
|
||||
case ARM64::STRWui:
|
||||
case ARM64::LDRSui:
|
||||
case ARM64::LDRDui:
|
||||
case ARM64::LDRQui:
|
||||
case ARM64::LDRXui:
|
||||
case ARM64::LDRWui:
|
||||
// do the unscaled versions as well
|
||||
case ARM64::STURSi:
|
||||
case ARM64::STURDi:
|
||||
case ARM64::STURQi:
|
||||
case ARM64::STURWi:
|
||||
case ARM64::STURXi:
|
||||
case ARM64::LDURSi:
|
||||
case ARM64::LDURDi:
|
||||
case ARM64::LDURQi:
|
||||
case ARM64::LDURWi:
|
||||
case ARM64::LDURXi: {
|
||||
// If this is a volatile load/store, don't mess with it.
|
||||
if (MI->hasOrderedMemoryRef()) {
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// Make sure this is a reg+imm (as opposed to an address reloc).
|
||||
if (!MI->getOperand(2).isImm()) {
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// Check if this load/store has a hint to avoid pair formation.
|
||||
// MachineMemOperands hints are set by the ARM64StorePairSuppress pass.
|
||||
if (TII->isLdStPairSuppressed(MI)) {
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// Look ahead up to ScanLimit instructions for a pairable instruction.
|
||||
bool mergeForward = false;
|
||||
MachineBasicBlock::iterator Paired =
|
||||
findMatchingInsn(MBBI, mergeForward, ScanLimit);
|
||||
if (Paired != E) {
|
||||
// Merge the loads into a pair. Keeping the iterator straight is a
|
||||
// pain, so we let the merge routine tell us what the next instruction
|
||||
// is after it's done mucking about.
|
||||
MBBI = mergePairedInsns(MBBI, Paired, mergeForward);
|
||||
|
||||
Modified = true;
|
||||
++NumPairCreated;
|
||||
if (isUnscaledLdst(MI->getOpcode()))
|
||||
++NumUnscaledPairCreated;
|
||||
break;
|
||||
}
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// FIXME: Do the other instructions.
|
||||
}
|
||||
}
|
||||
|
||||
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
|
||||
MBBI != E;) {
|
||||
MachineInstr *MI = MBBI;
|
||||
// Do update merging. It's simpler to keep this separate from the above
|
||||
// switch, though not strictly necessary.
|
||||
int Opc = MI->getOpcode();
|
||||
switch (Opc) {
|
||||
default:
|
||||
// Just move on to the next instruction.
|
||||
++MBBI;
|
||||
break;
|
||||
case ARM64::STRSui:
|
||||
case ARM64::STRDui:
|
||||
case ARM64::STRQui:
|
||||
case ARM64::STRXui:
|
||||
case ARM64::STRWui:
|
||||
case ARM64::LDRSui:
|
||||
case ARM64::LDRDui:
|
||||
case ARM64::LDRQui:
|
||||
case ARM64::LDRXui:
|
||||
case ARM64::LDRWui:
|
||||
// do the unscaled versions as well
|
||||
case ARM64::STURSi:
|
||||
case ARM64::STURDi:
|
||||
case ARM64::STURQi:
|
||||
case ARM64::STURWi:
|
||||
case ARM64::STURXi:
|
||||
case ARM64::LDURSi:
|
||||
case ARM64::LDURDi:
|
||||
case ARM64::LDURQi:
|
||||
case ARM64::LDURWi:
|
||||
case ARM64::LDURXi: {
|
||||
// Make sure this is a reg+imm (as opposed to an address reloc).
|
||||
if (!MI->getOperand(2).isImm()) {
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// Look ahead up to ScanLimit instructions for a mergable instruction.
|
||||
MachineBasicBlock::iterator Update =
|
||||
findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
|
||||
if (Update != E) {
|
||||
// Merge the update into the ld/st.
|
||||
MBBI = mergePostIdxUpdateInsn(MBBI, Update);
|
||||
Modified = true;
|
||||
++NumPostFolded;
|
||||
break;
|
||||
}
|
||||
// Don't know how to handle pre/post-index versions, so move to the next
|
||||
// instruction.
|
||||
if (isUnscaledLdst(Opc)) {
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
|
||||
// Look back to try to find a pre-index instruction. For example,
|
||||
// add x0, x0, #8
|
||||
// ldr x1, [x0]
|
||||
// merged into:
|
||||
// ldr x1, [x0, #8]!
|
||||
Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
|
||||
if (Update != E) {
|
||||
// Merge the update into the ld/st.
|
||||
MBBI = mergePreIdxUpdateInsn(MBBI, Update);
|
||||
Modified = true;
|
||||
++NumPreFolded;
|
||||
break;
|
||||
}
|
||||
|
||||
// Look forward to try to find a post-index instruction. For example,
|
||||
// ldr x1, [x0, #64]
|
||||
// add x0, x0, #64
|
||||
// merged into:
|
||||
// ldr x1, [x0], #64
|
||||
|
||||
// The immediate in the load/store is scaled by the size of the register
|
||||
// being loaded. The immediate in the add we're looking for,
|
||||
// however, is not, so adjust here.
|
||||
int Value = MI->getOperand(2).getImm() *
|
||||
TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
|
||||
->getSize();
|
||||
Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
|
||||
if (Update != E) {
|
||||
// Merge the update into the ld/st.
|
||||
MBBI = mergePreIdxUpdateInsn(MBBI, Update);
|
||||
Modified = true;
|
||||
++NumPreFolded;
|
||||
break;
|
||||
}
|
||||
|
||||
// Nothing found. Just move to the next instruction.
|
||||
++MBBI;
|
||||
break;
|
||||
}
|
||||
// FIXME: Do the other instructions.
|
||||
}
|
||||
}
|
||||
|
||||
return Modified;
|
||||
}
|
||||
|
||||
bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
|
||||
// Early exit if pass disabled.
|
||||
if (!DoLoadStoreOpt)
|
||||
return false;
|
||||
|
||||
const TargetMachine &TM = Fn.getTarget();
|
||||
TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
|
||||
TRI = TM.getRegisterInfo();
|
||||
|
||||
bool Modified = false;
|
||||
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
|
||||
++MFI) {
|
||||
MachineBasicBlock &MBB = *MFI;
|
||||
Modified |= optimizeBlock(MBB);
|
||||
}
|
||||
|
||||
return Modified;
|
||||
}
|
||||
|
||||
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
|
||||
// loads and stores near one another?
|
||||
|
||||
/// createARM64LoadStoreOptimizationPass - returns an instance of the load / store
|
||||
/// optimization pass.
|
||||
FunctionPass *llvm::createARM64LoadStoreOptimizationPass() {
|
||||
return new ARM64LoadStoreOpt();
|
||||
}
|
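For orientation, here is a minimal sketch of how a pass like the one above is typically scheduled by the target. The ARM64PassConfig hook shown (addPreSched2) is an assumption for illustration only; the real wiring lives in ARM64TargetMachine.cpp and is not part of the hunk above.

// Hypothetical sketch (not part of this diff): schedule the load/store
// optimizer after register allocation so pairing sees physical registers.
bool ARM64PassConfig::addPreSched2() {            // hook name is an assumption
  addPass(createARM64LoadStoreOptimizationPass());
  return true;
}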
201
lib/Target/ARM64/ARM64MCInstLower.cpp
Normal file
@ -0,0 +1,201 @@
|
||||
//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains code to lower ARM64 MachineInstrs to their corresponding
|
||||
// MCInst records.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64MCInstLower.h"
|
||||
#include "MCTargetDesc/ARM64BaseInfo.h"
|
||||
#include "MCTargetDesc/ARM64MCExpr.h"
|
||||
#include "llvm/CodeGen/AsmPrinter.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/IR/Mangler.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/Support/CodeGen.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
using namespace llvm;
|
||||
|
||||
ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang,
|
||||
AsmPrinter &printer)
|
||||
: Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
|
||||
|
||||
MCSymbol *
|
||||
ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
|
||||
return Printer.getSymbol(MO.getGlobal());
|
||||
}
|
||||
|
||||
MCSymbol *
|
||||
ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
|
||||
return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
|
||||
}
|
||||
|
||||
MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
|
||||
MCSymbol *Sym) const {
|
||||
// FIXME: We would like an efficient form for this, so we don't have to do a
|
||||
// lot of extra uniquing.
|
||||
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
|
||||
if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) {
|
||||
if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
|
||||
RefKind = MCSymbolRefExpr::VK_GOTPAGE;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
|
||||
ARM64II::MO_PAGEOFF)
|
||||
RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
|
||||
else
|
||||
assert(0 && "Unexpected target flags with MO_GOT on GV operand");
|
||||
} else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) {
|
||||
if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
|
||||
RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
|
||||
ARM64II::MO_PAGEOFF)
|
||||
RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
|
||||
else
|
||||
llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
|
||||
} else {
|
||||
if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
|
||||
RefKind = MCSymbolRefExpr::VK_PAGE;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
|
||||
ARM64II::MO_PAGEOFF)
|
||||
RefKind = MCSymbolRefExpr::VK_PAGEOFF;
|
||||
}
|
||||
const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
|
||||
if (!MO.isJTI() && MO.getOffset())
|
||||
Expr = MCBinaryExpr::CreateAdd(
|
||||
Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
|
||||
return MCOperand::CreateExpr(Expr);
|
||||
}
|
||||
|
||||
MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
|
||||
MCSymbol *Sym) const {
|
||||
uint32_t RefFlags = 0;
|
||||
|
||||
if (MO.getTargetFlags() & ARM64II::MO_GOT)
|
||||
RefFlags |= ARM64MCExpr::VK_GOT;
|
||||
else if (MO.getTargetFlags() & ARM64II::MO_TLS) {
|
||||
TLSModel::Model Model;
|
||||
if (MO.isGlobal()) {
|
||||
const GlobalValue *GV = MO.getGlobal();
|
||||
Model = Printer.TM.getTLSModel(GV);
|
||||
} else {
|
||||
assert(MO.isSymbol() &&
|
||||
StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
|
||||
"unexpected external TLS symbol");
|
||||
Model = TLSModel::GeneralDynamic;
|
||||
}
|
||||
switch (Model) {
|
||||
case TLSModel::InitialExec:
|
||||
RefFlags |= ARM64MCExpr::VK_GOTTPREL;
|
||||
break;
|
||||
case TLSModel::LocalExec:
|
||||
RefFlags |= ARM64MCExpr::VK_TPREL;
|
||||
break;
|
||||
case TLSModel::LocalDynamic:
|
||||
RefFlags |= ARM64MCExpr::VK_DTPREL;
|
||||
break;
|
||||
case TLSModel::GeneralDynamic:
|
||||
RefFlags |= ARM64MCExpr::VK_TLSDESC;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// No modifier means this is a generic reference, classified as absolute for
|
||||
// the cases where it matters (:abs_g0: etc).
|
||||
RefFlags |= ARM64MCExpr::VK_ABS;
|
||||
}
|
||||
|
||||
if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
|
||||
RefFlags |= ARM64MCExpr::VK_PAGE;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF)
|
||||
RefFlags |= ARM64MCExpr::VK_PAGEOFF;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3)
|
||||
RefFlags |= ARM64MCExpr::VK_G3;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2)
|
||||
RefFlags |= ARM64MCExpr::VK_G2;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1)
|
||||
RefFlags |= ARM64MCExpr::VK_G1;
|
||||
else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0)
|
||||
RefFlags |= ARM64MCExpr::VK_G0;
|
||||
|
||||
if (MO.getTargetFlags() & ARM64II::MO_NC)
|
||||
RefFlags |= ARM64MCExpr::VK_NC;
|
||||
|
||||
const MCExpr *Expr =
|
||||
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
|
||||
if (!MO.isJTI() && MO.getOffset())
|
||||
Expr = MCBinaryExpr::CreateAdd(
|
||||
Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
|
||||
|
||||
ARM64MCExpr::VariantKind RefKind;
|
||||
RefKind = static_cast<ARM64MCExpr::VariantKind>(RefFlags);
|
||||
Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx);
|
||||
|
||||
return MCOperand::CreateExpr(Expr);
|
||||
}
|
||||
|
||||
MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
|
||||
MCSymbol *Sym) const {
|
||||
if (TargetTriple.isOSDarwin())
|
||||
return lowerSymbolOperandDarwin(MO, Sym);
|
||||
|
||||
assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
|
||||
return lowerSymbolOperandELF(MO, Sym);
|
||||
}
|
||||
|
||||
bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO,
|
||||
MCOperand &MCOp) const {
|
||||
switch (MO.getType()) {
|
||||
default:
|
||||
assert(0 && "unknown operand type");
|
||||
case MachineOperand::MO_Register:
|
||||
// Ignore all implicit register operands.
|
||||
if (MO.isImplicit())
|
||||
return false;
|
||||
MCOp = MCOperand::CreateReg(MO.getReg());
|
||||
break;
|
||||
case MachineOperand::MO_RegisterMask:
|
||||
// Regmasks are like implicit defs.
|
||||
return false;
|
||||
case MachineOperand::MO_Immediate:
|
||||
MCOp = MCOperand::CreateImm(MO.getImm());
|
||||
break;
|
||||
case MachineOperand::MO_MachineBasicBlock:
|
||||
MCOp = MCOperand::CreateExpr(
|
||||
MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
|
||||
break;
|
||||
case MachineOperand::MO_GlobalAddress:
|
||||
MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
|
||||
break;
|
||||
case MachineOperand::MO_ExternalSymbol:
|
||||
MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
|
||||
break;
|
||||
case MachineOperand::MO_JumpTableIndex:
|
||||
MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
|
||||
break;
|
||||
case MachineOperand::MO_ConstantPoolIndex:
|
||||
MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
|
||||
break;
|
||||
case MachineOperand::MO_BlockAddress:
|
||||
MCOp = LowerSymbolOperand(
|
||||
MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
|
||||
OutMI.setOpcode(MI->getOpcode());
|
||||
|
||||
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
||||
MCOperand MCOp;
|
||||
if (lowerOperand(MI->getOperand(i), MCOp))
|
||||
OutMI.addOperand(MCOp);
|
||||
}
|
||||
}
|
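As a rough usage sketch, an asm printer drives the lowering helper above roughly as follows. The member name MCInstLowering and the exact streamer call are assumptions; the real caller (ARM64AsmPrinter.cpp) appears elsewhere in this commit.

// Hypothetical caller sketch: lower one MachineInstr and emit it.
void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  MCInst TmpInst;
  MCInstLowering.Lower(MI, TmpInst);    // ARM64MCInstLower member (assumed name)
  OutStreamer.EmitInstruction(TmpInst); // exact streamer API varies by revision
}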
52
lib/Target/ARM64/ARM64MCInstLower.h
Normal file
@ -0,0 +1,52 @@
|
||||
//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64_MCINSTLOWER_H
|
||||
#define ARM64_MCINSTLOWER_H
|
||||
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Support/Compiler.h"
|
||||
|
||||
namespace llvm {
|
||||
class AsmPrinter;
|
||||
class MCAsmInfo;
|
||||
class MCContext;
|
||||
class MCInst;
|
||||
class MCOperand;
|
||||
class MCSymbol;
|
||||
class MachineInstr;
|
||||
class MachineModuleInfoMachO;
|
||||
class MachineOperand;
|
||||
class Mangler;
|
||||
|
||||
/// ARM64MCInstLower - This class is used to lower a MachineInstr
|
||||
/// into an MCInst.
|
||||
class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower {
|
||||
MCContext &Ctx;
|
||||
AsmPrinter &Printer;
|
||||
Triple TargetTriple;
|
||||
|
||||
public:
|
||||
ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
|
||||
|
||||
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
|
||||
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
|
||||
|
||||
MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
|
||||
MCSymbol *Sym) const;
|
||||
MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
|
||||
MCSymbol *Sym) const;
|
||||
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
|
||||
|
||||
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
|
||||
MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
126
lib/Target/ARM64/ARM64MachineFunctionInfo.h
Normal file
@ -0,0 +1,126 @@
|
||||
//===- ARM64MachineFunctionInfo.h - ARM64 machine function info -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares ARM64-specific per-machine-function information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64MACHINEFUNCTIONINFO_H
|
||||
#define ARM64MACHINEFUNCTIONINFO_H
|
||||
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/MC/MCLinkerOptimizationHint.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and
|
||||
/// contains private ARM64-specific information for each MachineFunction.
|
||||
class ARM64FunctionInfo : public MachineFunctionInfo {
|
||||
|
||||
/// HasStackFrame - True if this function has a stack frame. Set by
|
||||
/// processFunctionBeforeCalleeSavedScan().
|
||||
bool HasStackFrame;
|
||||
|
||||
/// \brief Amount of stack frame size, not including callee-saved registers.
|
||||
unsigned LocalStackSize;
|
||||
|
||||
/// \brief Number of TLS accesses using the special (combinable)
|
||||
/// _TLS_MODULE_BASE_ symbol.
|
||||
unsigned NumLocalDynamicTLSAccesses;
|
||||
|
||||
/// \brief FrameIndex for start of varargs area for arguments passed on the
|
||||
/// stack.
|
||||
int VarArgsStackIndex;
|
||||
|
||||
/// \brief FrameIndex for start of varargs area for arguments passed in
|
||||
/// general purpose registers.
|
||||
int VarArgsGPRIndex;
|
||||
|
||||
/// \brief Size of the varargs area for arguments passed in general purpose
|
||||
/// registers.
|
||||
unsigned VarArgsGPRSize;
|
||||
|
||||
/// \brief FrameIndex for start of varargs area for arguments passed in
|
||||
/// floating-point registers.
|
||||
int VarArgsFPRIndex;
|
||||
|
||||
/// \brief Size of the varargs area for arguments passed in floating-point
|
||||
/// registers.
|
||||
unsigned VarArgsFPRSize;
|
||||
|
||||
public:
|
||||
ARM64FunctionInfo()
|
||||
: HasStackFrame(false), LocalStackSize(0), NumLocalDynamicTLSAccesses(0),
|
||||
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
|
||||
VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
|
||||
|
||||
explicit ARM64FunctionInfo(MachineFunction &MF)
|
||||
: HasStackFrame(false), LocalStackSize(0), NumLocalDynamicTLSAccesses(0),
|
||||
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
|
||||
VarArgsFPRIndex(0), VarArgsFPRSize(0) {
|
||||
(void)MF;
|
||||
}
|
||||
|
||||
bool hasStackFrame() const { return HasStackFrame; }
|
||||
void setHasStackFrame(bool s) { HasStackFrame = s; }
|
||||
|
||||
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
|
||||
unsigned getLocalStackSize() const { return LocalStackSize; }
|
||||
|
||||
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
|
||||
unsigned getNumLocalDynamicTLSAccesses() const {
|
||||
return NumLocalDynamicTLSAccesses;
|
||||
}
|
||||
|
||||
int getVarArgsStackIndex() const { return VarArgsStackIndex; }
|
||||
void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
|
||||
|
||||
int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
|
||||
void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
|
||||
|
||||
unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
|
||||
void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
|
||||
|
||||
int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
|
||||
void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
|
||||
|
||||
unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
|
||||
void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
|
||||
|
||||
typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
|
||||
|
||||
const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
|
||||
|
||||
// Shortcuts for LOH related types.
|
||||
typedef LOHDirective<const MachineInstr> MILOHDirective;
|
||||
typedef MILOHDirective::LOHArgs MILOHArgs;
|
||||
|
||||
typedef LOHContainer<const MachineInstr> MILOHContainer;
|
||||
typedef MILOHContainer::LOHDirectives MILOHDirectives;
|
||||
|
||||
const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
|
||||
|
||||
/// Add a LOH directive of this @p Kind and this @p Args.
|
||||
void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
|
||||
LOHContainerSet.addDirective(Kind, Args);
|
||||
for (MILOHArgs::const_iterator It = Args.begin(), EndIt = Args.end();
|
||||
It != EndIt; ++It)
|
||||
LOHRelated.insert(*It);
|
||||
}
|
||||
|
||||
private:
|
||||
// Hold the lists of LOHs.
|
||||
MILOHContainer LOHContainerSet;
|
||||
SetOfInstructions LOHRelated;
|
||||
};
|
||||
} // End llvm namespace
|
||||
|
||||
#endif // ARM64MACHINEFUNCTIONINFO_H
|
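A short consumer sketch for the header above: other ARM64 code reaches this object through MachineFunction::getInfo. The free function below is hypothetical; only the setters it calls are declared in the header.

// Hypothetical sketch: store varargs save-area facts computed during argument
// lowering so that frame lowering can retrieve them later.
static void recordVarArgsGPRArea(MachineFunction &MF, int GPRIdx,
                                 unsigned GPRSize) {
  ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSize);
}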
6586
lib/Target/ARM64/ARM64PerfectShuffle.h
Normal file
File diff suppressed because it is too large
588
lib/Target/ARM64/ARM64PromoteConstant.cpp
Normal file
@ -0,0 +1,588 @@
|
||||
|
||||
//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64PromoteConstant pass, which promotes constants
|
||||
// to global variables when this is likely to be more efficient.
|
||||
// Currently only types related to constant vectors (i.e., constant vectors, arrays
|
||||
// of constant vectors, constant structure with a constant vector field, etc.)
|
||||
// are promoted to global variables.
|
||||
// Indeed, constant vectors are likely to be lowered into the target constant pool
|
||||
// during instruction selection.
|
||||
// Therefore, the access will remain the same (memory load), but the structure
|
||||
// types are not split into different constant pool accesses for each field.
|
||||
// The bonus side effect is that created globals may be merged by the global
|
||||
// merge pass.
|
||||
//
|
||||
// FIXME: This pass may be useful for other targets too.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-promote-const"
|
||||
#include "ARM64.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalVariable.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Stress testing mode - disable heuristics.
|
||||
static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden,
|
||||
cl::desc("Promote all vector constants"));
|
||||
|
||||
STATISTIC(NumPromoted, "Number of promoted constants");
|
||||
STATISTIC(NumPromotedUses, "Number of promoted constants uses");
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ARM64PromoteConstant
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace {
|
||||
/// Promotes interesting constants into global variables.
|
||||
/// The motivating example is:
|
||||
/// static const uint16_t TableA[32] = {
|
||||
/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
|
||||
/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
|
||||
/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
|
||||
/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
|
||||
/// };
|
||||
///
|
||||
/// uint8x16x4_t LoadStatic(void) {
|
||||
/// uint8x16x4_t ret;
|
||||
/// ret.val[0] = vld1q_u16(TableA + 0);
|
||||
/// ret.val[1] = vld1q_u16(TableA + 8);
|
||||
/// ret.val[2] = vld1q_u16(TableA + 16);
|
||||
/// ret.val[3] = vld1q_u16(TableA + 24);
|
||||
/// return ret;
|
||||
/// }
|
||||
///
|
||||
/// The constants in that example are folded into the uses. Thus, 4 different
|
||||
/// constants are created.
|
||||
/// As their type is a vector, the cheapest way to create them is to load them
|
||||
/// from memory.
|
||||
/// Therefore the final assembly has 4 different loads.
|
||||
/// With this pass enabled, only one load is issued for the constants.
|
||||
class ARM64PromoteConstant : public ModulePass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
ARM64PromoteConstant() : ModulePass(ID) {}
|
||||
|
||||
virtual const char *getPassName() const { return "ARM64 Promote Constant"; }
|
||||
|
||||
/// Iterate over the functions and promote the interesting constants into
|
||||
/// global variables with module scope.
|
||||
bool runOnModule(Module &M) {
|
||||
DEBUG(dbgs() << getPassName() << '\n');
|
||||
bool Changed = false;
|
||||
for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
|
||||
++IFn) {
|
||||
Changed |= runOnFunction(*IFn);
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Look for interesting constants used within the given function.
|
||||
/// Promote them into global variables, load these global variables within
|
||||
/// the related function, so that the number of inserted loads is minimal.
|
||||
bool runOnFunction(Function &F);
|
||||
|
||||
// This transformation requires dominator info
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
/// Type to store a list of User
|
||||
typedef SmallVector<Value::user_iterator, 4> Users;
|
||||
/// Map an insertion point to all the uses it dominates.
|
||||
typedef DenseMap<Instruction *, Users> InsertionPoints;
|
||||
/// Map a function to the required insertion point of load for a
|
||||
/// global variable
|
||||
typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
|
||||
|
||||
/// Find the closest point that dominates the given Use.
|
||||
Instruction *findInsertionPoint(Value::user_iterator &Use);
|
||||
|
||||
/// Check if the given insertion point is dominated by an existing
|
||||
/// insertion point.
|
||||
/// If true, the given use is added to the list of dominated uses for
|
||||
/// the related existing point.
|
||||
/// \param NewPt the insertion point to be checked
|
||||
/// \param UseIt the use to be added into the list of dominated uses
|
||||
/// \param InsertPts existing insertion points
|
||||
/// \pre NewPt and all instruction in InsertPts belong to the same function
|
||||
/// \return true if one of the insertion points in InsertPts dominates NewPt,
|
||||
/// false otherwise
|
||||
bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
|
||||
InsertionPoints &InsertPts);
|
||||
|
||||
/// Check if the given insertion point can be merged with an existing
|
||||
/// insertion point in a common dominator.
|
||||
/// If true, the given use is added to the list of the created insertion
|
||||
/// point.
|
||||
/// \param NewPt the insertion point to be checked
|
||||
/// \param UseIt the use to be added into the list of dominated uses
|
||||
/// \param InsertPts existing insertion points
|
||||
/// \pre NewPt and all instruction in InsertPts belong to the same function
|
||||
/// \pre isDominated returns false for the exact same parameters.
|
||||
/// \return true if there exists an insertion point in InsertPts that could
|
||||
/// have been merged with NewPt in a common dominator,
|
||||
/// false otherwise
|
||||
bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
|
||||
InsertionPoints &InsertPts);
|
||||
|
||||
/// Compute the minimal insertion points to dominate all the interesting
|
||||
/// uses of value.
|
||||
/// Insertion points are grouped per function and each insertion point
|
||||
/// contains a list of all the uses it dominates within the related function
|
||||
/// \param Val constant to be examined
|
||||
/// \param InsPtsPerFunc[out] output storage of the analysis
|
||||
void computeInsertionPoints(Constant *Val,
|
||||
InsertionPointsPerFunc &InsPtsPerFunc);
|
||||
|
||||
/// Insert a definition of a new global variable at each point contained in
|
||||
/// InsPtsPerFunc and update the related uses (also contained in
|
||||
/// InsPtsPerFunc).
|
||||
bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
|
||||
|
||||
/// Compute the minimal insertion points to dominate all the interesting
|
||||
/// uses of Val and insert a definition of a new global variable
|
||||
/// at these points.
|
||||
/// Also update the uses of Val accordingly.
|
||||
/// Currently a use of Val is considered interesting if:
|
||||
/// - Val is not UndefValue
|
||||
/// - Val is not zeroinitialized
|
||||
/// - Replacing Val with a load of a global variable is valid.
|
||||
/// \see shouldConvert for more details
|
||||
bool computeAndInsertDefinitions(Constant *Val);
|
||||
|
||||
/// Promote the given constant into a global variable if it is expected to
|
||||
/// be profitable.
|
||||
/// \return true if Cst has been promoted
|
||||
bool promoteConstant(Constant *Cst);
|
||||
|
||||
/// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
|
||||
/// Append UseIt to this list and delete the entry of IPI in InsertPts.
|
||||
static void appendAndTransferDominatedUses(Instruction *NewPt,
|
||||
Value::user_iterator &UseIt,
|
||||
InsertionPoints::iterator &IPI,
|
||||
InsertionPoints &InsertPts) {
|
||||
// Record the dominated use
|
||||
IPI->second.push_back(UseIt);
|
||||
// Transfer the dominated uses of IPI to NewPt
|
||||
// Inserting into the DenseMap may invalidate existing iterator.
|
||||
// Keep a copy of the key to find the iterator to erase.
|
||||
Instruction *OldInstr = IPI->first;
|
||||
InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
|
||||
// Erase IPI
|
||||
IPI = InsertPts.find(OldInstr);
|
||||
InsertPts.erase(IPI);
|
||||
}
|
||||
};
|
||||
} // end anonymous namespace
|
||||
|
||||
char ARM64PromoteConstant::ID = 0;
|
||||
|
||||
namespace llvm {
|
||||
void initializeARM64PromoteConstantPass(PassRegistry &);
|
||||
}
|
||||
|
||||
INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const",
|
||||
"ARM64 Promote Constant Pass", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||
INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const",
|
||||
"ARM64 Promote Constant Pass", false, false)
|
||||
|
||||
ModulePass *llvm::createARM64PromoteConstantPass() {
|
||||
return new ARM64PromoteConstant();
|
||||
}
|
||||
|
||||
/// Check if the given type uses a vector type.
|
||||
static bool isConstantUsingVectorTy(const Type *CstTy) {
|
||||
if (CstTy->isVectorTy())
|
||||
return true;
|
||||
if (CstTy->isStructTy()) {
|
||||
for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
|
||||
EltIdx < EndEltIdx; ++EltIdx)
|
||||
if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
|
||||
return true;
|
||||
} else if (CstTy->isArrayTy())
|
||||
return isConstantUsingVectorTy(CstTy->getArrayElementType());
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
|
||||
/// a load of a global variable initialized with Cst.
|
||||
/// A use should be converted if it is legal to do so.
|
||||
/// For instance, it is not legal to turn the mask operand of a shuffle vector
|
||||
/// into a load of a global variable.
|
||||
static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
|
||||
unsigned OpIdx) {
|
||||
// shufflevector instruction expects a const for the mask argument, i.e., the
|
||||
// third argument. Do not promote this use in that case.
|
||||
if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
|
||||
return false;
|
||||
|
||||
// extractvalue instruction expects a const idx
|
||||
if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
|
||||
return false;
|
||||
|
||||
// insertvalue instruction expects a const idx
|
||||
if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
|
||||
return false;
|
||||
|
||||
if (isa<const AllocaInst>(Instr) && OpIdx > 0)
|
||||
return false;
|
||||
|
||||
// Alignment argument must be constant
|
||||
if (isa<const LoadInst>(Instr) && OpIdx > 0)
|
||||
return false;
|
||||
|
||||
// Alignment argument must be constant
|
||||
if (isa<const StoreInst>(Instr) && OpIdx > 1)
|
||||
return false;
|
||||
|
||||
// Index must be constant
|
||||
if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
|
||||
return false;
|
||||
|
||||
// Personality function and filters must be constant.
|
||||
// Give up on that instruction.
|
||||
if (isa<const LandingPadInst>(Instr))
|
||||
return false;
|
||||
|
||||
// switch instruction expects constants to compare to
|
||||
if (isa<const SwitchInst>(Instr))
|
||||
return false;
|
||||
|
||||
// Expected address must be a constant
|
||||
if (isa<const IndirectBrInst>(Instr))
|
||||
return false;
|
||||
|
||||
// Do not mess with intrinsic
|
||||
if (isa<const IntrinsicInst>(Instr))
|
||||
return false;
|
||||
|
||||
// Do not mess with inline asm
|
||||
const CallInst *CI = dyn_cast<const CallInst>(Instr);
|
||||
if (CI && isa<const InlineAsm>(CI->getCalledValue()))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Check if the given Cst should be converted into
|
||||
/// a load of a global variable initialized with Cst.
|
||||
/// A constant should be converted if it is likely that the materialization of
|
||||
/// the constant will be tricky. Thus, we give up on zero or undef values.
|
||||
///
|
||||
/// \todo Currently, accept only vector related types.
|
||||
/// Also we give up on all simple vector types to keep the existing
|
||||
/// behavior. Otherwise, we should push here all the checks of the lowering of
|
||||
/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
|
||||
/// constants via global merge and the fact that the same constant is stored
|
||||
/// only once with this method (versus once per function that uses the constant
|
||||
/// with the regular approach, even for float).
|
||||
/// Again, the simplest solution would be to promote every
|
||||
/// constant and rematerialize them when they are actually cheap to create.
|
||||
static bool shouldConvert(const Constant *Cst) {
|
||||
if (isa<const UndefValue>(Cst))
|
||||
return false;
|
||||
|
||||
// FIXME: In some cases, it may be interesting to promote in memory
|
||||
// a zero initialized constant.
|
||||
// E.g., when the type of Cst require more instructions than the
|
||||
// adrp/add/load sequence or when this sequence can be shared by several
|
||||
// instances of Cst.
|
||||
// Ideally, we could promote this into a global and rematerialize the constant
|
||||
// when promotion turns out to have been a bad idea.
|
||||
if (Cst->isZeroValue())
|
||||
return false;
|
||||
|
||||
if (Stress)
|
||||
return true;
|
||||
|
||||
// FIXME: see function \todo
|
||||
if (Cst->getType()->isVectorTy())
|
||||
return false;
|
||||
return isConstantUsingVectorTy(Cst->getType());
|
||||
}
|
||||
|
||||
Instruction *
|
||||
ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
|
||||
// If this user is a phi, the insertion point is in the related
|
||||
// incoming basic block
|
||||
PHINode *PhiInst = dyn_cast<PHINode>(*Use);
|
||||
Instruction *InsertionPoint;
|
||||
if (PhiInst)
|
||||
InsertionPoint =
|
||||
PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
|
||||
else
|
||||
InsertionPoint = dyn_cast<Instruction>(*Use);
|
||||
assert(InsertionPoint && "User is not an instruction!");
|
||||
return InsertionPoint;
|
||||
}
|
||||
|
||||
bool ARM64PromoteConstant::isDominated(Instruction *NewPt,
|
||||
Value::user_iterator &UseIt,
|
||||
InsertionPoints &InsertPts) {
|
||||
|
||||
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
|
||||
*NewPt->getParent()->getParent()).getDomTree();
|
||||
|
||||
// Traverse all the existing insertion points and check if one is dominating
|
||||
// NewPt
|
||||
for (InsertionPoints::iterator IPI = InsertPts.begin(),
|
||||
EndIPI = InsertPts.end();
|
||||
IPI != EndIPI; ++IPI) {
|
||||
if (NewPt == IPI->first || DT.dominates(IPI->first, NewPt) ||
|
||||
// When IPI->first is a terminator instruction, DT may think that
|
||||
// the result is defined on the edge.
|
||||
// Here we are testing the insertion point, not the definition.
|
||||
(IPI->first->getParent() != NewPt->getParent() &&
|
||||
DT.dominates(IPI->first->getParent(), NewPt->getParent()))) {
|
||||
// No need to insert this point
|
||||
// Record the dominated use
|
||||
DEBUG(dbgs() << "Insertion point dominated by:\n");
|
||||
DEBUG(IPI->first->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
IPI->second.push_back(UseIt);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
|
||||
Value::user_iterator &UseIt,
|
||||
InsertionPoints &InsertPts) {
|
||||
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
|
||||
*NewPt->getParent()->getParent()).getDomTree();
|
||||
BasicBlock *NewBB = NewPt->getParent();
|
||||
|
||||
// Traverse all the existing insertion points and check if one is dominated by
|
||||
// NewPt and thus useless or can be combined with NewPt into a common
|
||||
// dominator
|
||||
for (InsertionPoints::iterator IPI = InsertPts.begin(),
|
||||
EndIPI = InsertPts.end();
|
||||
IPI != EndIPI; ++IPI) {
|
||||
BasicBlock *CurBB = IPI->first->getParent();
|
||||
if (NewBB == CurBB) {
|
||||
// Instructions are in the same block.
|
||||
// By construction, NewPt is dominating the other.
|
||||
// Indeed, isDominated returned false with the exact same arguments.
|
||||
DEBUG(dbgs() << "Merge insertion point with:\n");
|
||||
DEBUG(IPI->first->print(dbgs()));
|
||||
DEBUG(dbgs() << "\nat considered insertion point.\n");
|
||||
appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Look for a common dominator
|
||||
BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
|
||||
// If none exists, we cannot merge these two points
|
||||
if (!CommonDominator)
|
||||
continue;
|
||||
|
||||
if (CommonDominator != NewBB) {
|
||||
// By construction, the CommonDominator cannot be CurBB
|
||||
assert(CommonDominator != CurBB &&
|
||||
"Instruction has not been rejected during isDominated check!");
|
||||
// Take the last instruction of the CommonDominator as insertion point
|
||||
NewPt = CommonDominator->getTerminator();
|
||||
}
|
||||
// else, CommonDominator is the block of NewBB, hence NewBB is the last
|
||||
// possible insertion point in that block
|
||||
DEBUG(dbgs() << "Merge insertion point with:\n");
|
||||
DEBUG(IPI->first->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
DEBUG(NewPt->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ARM64PromoteConstant::computeInsertionPoints(
|
||||
Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
|
||||
DEBUG(dbgs() << "** Compute insertion points **\n");
|
||||
for (Value::user_iterator UseIt = Val->user_begin(),
|
||||
EndUseIt = Val->user_end();
|
||||
UseIt != EndUseIt; ++UseIt) {
|
||||
// If the user is not an Instruction, we cannot modify it
|
||||
if (!isa<Instruction>(*UseIt))
|
||||
continue;
|
||||
|
||||
// Filter out uses that should not be converted
|
||||
if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
|
||||
continue;
|
||||
|
||||
DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
|
||||
DEBUG((*UseIt)->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
|
||||
Instruction *InsertionPoint = findInsertionPoint(UseIt);
|
||||
|
||||
DEBUG(dbgs() << "Considered insertion point:\n");
|
||||
DEBUG(InsertionPoint->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
|
||||
// Check if the current insertion point is useless, i.e., it is dominated
|
||||
// by another one.
|
||||
InsertionPoints &InsertPts =
|
||||
InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
|
||||
if (isDominated(InsertionPoint, UseIt, InsertPts))
|
||||
continue;
|
||||
// This insertion point is useful, check if we can merge some insertion
|
||||
// point in a common dominator or if NewPt dominates an existing one.
|
||||
if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
|
||||
continue;
|
||||
|
||||
DEBUG(dbgs() << "Keep considered insertion point\n");
|
||||
|
||||
// It is definitely useful on its own
|
||||
InsertPts[InsertionPoint].push_back(UseIt);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
ARM64PromoteConstant::insertDefinitions(Constant *Cst,
|
||||
InsertionPointsPerFunc &InsPtsPerFunc) {
|
||||
// We will create one global variable per Module
|
||||
DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
|
||||
bool HasChanged = false;
|
||||
|
||||
// Traverse all insertion points in all the functions
|
||||
for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
|
||||
EndIt = InsPtsPerFunc.end();
|
||||
FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
|
||||
InsertionPoints &InsertPts = FctToInstPtsIt->second;
|
||||
// Do more check for debug purposes
|
||||
#ifndef NDEBUG
|
||||
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
|
||||
*FctToInstPtsIt->first).getDomTree();
|
||||
#endif
|
||||
GlobalVariable *PromotedGV;
|
||||
assert(!InsertPts.empty() && "Empty uses does not need a definition");
|
||||
|
||||
Module *M = FctToInstPtsIt->first->getParent();
|
||||
DenseMap<Module *, GlobalVariable *>::iterator MapIt =
|
||||
ModuleToMergedGV.find(M);
|
||||
if (MapIt == ModuleToMergedGV.end()) {
|
||||
PromotedGV = new GlobalVariable(
|
||||
*M, Cst->getType(), true, GlobalValue::InternalLinkage, 0,
|
||||
"_PromotedConst", 0, GlobalVariable::NotThreadLocal);
|
||||
PromotedGV->setInitializer(Cst);
|
||||
ModuleToMergedGV[M] = PromotedGV;
|
||||
DEBUG(dbgs() << "Global replacement: ");
|
||||
DEBUG(PromotedGV->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
++NumPromoted;
|
||||
HasChanged = true;
|
||||
} else {
|
||||
PromotedGV = MapIt->second;
|
||||
}
|
||||
|
||||
for (InsertionPoints::iterator IPI = InsertPts.begin(),
|
||||
EndIPI = InsertPts.end();
|
||||
IPI != EndIPI; ++IPI) {
|
||||
// Create the load of the global variable
|
||||
IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
|
||||
LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
|
||||
DEBUG(dbgs() << "**********\n");
|
||||
DEBUG(dbgs() << "New def: ");
|
||||
DEBUG(LoadedCst->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
|
||||
// Update the dominated uses
|
||||
Users &DominatedUsers = IPI->second;
|
||||
for (Users::iterator UseIt = DominatedUsers.begin(),
|
||||
EndIt = DominatedUsers.end();
|
||||
UseIt != EndIt; ++UseIt) {
|
||||
#ifndef NDEBUG
|
||||
assert((DT.dominates(LoadedCst, cast<Instruction>(**UseIt)) ||
|
||||
(isa<PHINode>(**UseIt) &&
|
||||
DT.dominates(LoadedCst, findInsertionPoint(*UseIt)))) &&
|
||||
"Inserted definition does not dominate all its uses!");
|
||||
#endif
|
||||
DEBUG(dbgs() << "Use to update " << UseIt->getOperandNo() << ":");
|
||||
DEBUG((*UseIt)->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
(*UseIt)->setOperand(UseIt->getOperandNo(), LoadedCst);
|
||||
++NumPromotedUses;
|
||||
}
|
||||
}
|
||||
}
|
||||
return HasChanged;
|
||||
}
|
||||
|
||||
bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
|
||||
InsertionPointsPerFunc InsertPtsPerFunc;
|
||||
computeInsertionPoints(Val, InsertPtsPerFunc);
|
||||
return insertDefinitions(Val, InsertPtsPerFunc);
|
||||
}
|
||||
|
||||
bool ARM64PromoteConstant::promoteConstant(Constant *Cst) {
|
||||
assert(Cst && "Given variable is not a valid constant.");
|
||||
|
||||
if (!shouldConvert(Cst))
|
||||
return false;
|
||||
|
||||
DEBUG(dbgs() << "******************************\n");
|
||||
DEBUG(dbgs() << "Candidate constant: ");
|
||||
DEBUG(Cst->print(dbgs()));
|
||||
DEBUG(dbgs() << '\n');
|
||||
|
||||
return computeAndInsertDefinitions(Cst);
|
||||
}
|
||||
|
||||
bool ARM64PromoteConstant::runOnFunction(Function &F) {
|
||||
// Look for instructions using constant vectors.
|
||||
// Promote that constant to a global variable.
|
||||
// Create as few loads of this variable as possible and update the uses
|
||||
// accordingly
|
||||
bool LocalChange = false;
|
||||
SmallSet<Constant *, 8> AlreadyChecked;
|
||||
|
||||
for (Function::iterator IBB = F.begin(), IEndBB = F.end(); IBB != IEndBB;
|
||||
++IBB) {
|
||||
for (BasicBlock::iterator II = IBB->begin(), IEndI = IBB->end();
|
||||
II != IEndI; ++II) {
|
||||
// Traverse the operand, looking for constant vectors
|
||||
// Replace them by a load of a global variable of type constant vector
|
||||
for (unsigned OpIdx = 0, EndOpIdx = II->getNumOperands();
|
||||
OpIdx != EndOpIdx; ++OpIdx) {
|
||||
Constant *Cst = dyn_cast<Constant>(II->getOperand(OpIdx));
|
||||
// There is no point in promoting global values; they are already global.
|
||||
// Do not promote constant expressions, as they may require some code
|
||||
// expansion.
|
||||
if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
|
||||
AlreadyChecked.insert(Cst))
|
||||
LocalChange |= promoteConstant(Cst);
|
||||
}
|
||||
}
|
||||
}
|
||||
return LocalChange;
|
||||
}
|
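For context, here is a minimal sketch of how the module pass above would be added to the codegen IR pipeline. The ARM64PassConfig::addIRPasses override and the opt-level guard are assumptions for illustration; the actual registration is handled in ARM64TargetMachine.cpp.

// Hypothetical sketch: run constant promotion before instruction selection,
// but only when optimizing.
void ARM64PassConfig::addIRPasses() {             // override point is assumed
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createARM64PromoteConstantPass());
  TargetPassConfig::addIRPasses();
}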
402
lib/Target/ARM64/ARM64RegisterInfo.cpp
Normal file
@ -0,0 +1,402 @@
|
||||
//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the ARM64 implementation of the TargetRegisterInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "ARM64FrameLowering.h"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64Subtarget.h"
|
||||
#include "MCTargetDesc/ARM64AddressingModes.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/RegisterScavenging.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetFrameLowering.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
|
||||
#define GET_REGINFO_TARGET_DESC
|
||||
#include "ARM64GenRegisterInfo.inc"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii,
|
||||
const ARM64Subtarget *sti)
|
||||
: ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {}
|
||||
|
||||
const uint16_t *
|
||||
ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
|
||||
assert(MF && "Invalid MachineFunction pointer.");
|
||||
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
|
||||
return CSR_ARM64_AllRegs_SaveList;
|
||||
else
|
||||
return CSR_ARM64_AAPCS_SaveList;
|
||||
}
|
||||
|
||||
const uint32_t *
|
||||
ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
|
||||
if (CC == CallingConv::AnyReg)
|
||||
return CSR_ARM64_AllRegs_RegMask;
|
||||
else
|
||||
return CSR_ARM64_AAPCS_RegMask;
|
||||
}
|
||||
|
||||
const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const {
|
||||
if (STI->isTargetDarwin())
|
||||
return CSR_ARM64_TLS_Darwin_RegMask;
|
||||
|
||||
assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
|
||||
return CSR_ARM64_TLS_ELF_RegMask;
|
||||
}
|
||||
|
||||
const uint32_t *
|
||||
ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
|
||||
// This should return a register mask that is the same as that returned by
|
||||
// getCallPreservedMask but that additionally preserves the register used for
|
||||
// the first i64 argument (which must also be the register used to return a
|
||||
// single i64 return value)
|
||||
//
|
||||
// In case that the calling convention does not use the same register for
|
||||
// both, the function should return NULL (does not currently apply)
|
||||
return CSR_ARM64_AAPCS_ThisReturn_RegMask;
|
||||
}
|
||||
|
||||
BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
|
||||
// FIXME: avoid re-calculating this every time.
|
||||
BitVector Reserved(getNumRegs());
|
||||
Reserved.set(ARM64::SP);
|
||||
Reserved.set(ARM64::XZR);
|
||||
Reserved.set(ARM64::WSP);
|
||||
Reserved.set(ARM64::WZR);
|
||||
|
||||
if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
|
||||
Reserved.set(ARM64::FP);
|
||||
Reserved.set(ARM64::W29);
|
||||
}
|
||||
|
||||
if (STI->isTargetDarwin()) {
|
||||
Reserved.set(ARM64::X18); // Platform register
|
||||
Reserved.set(ARM64::W18);
|
||||
}
|
||||
|
||||
if (hasBasePointer(MF)) {
|
||||
Reserved.set(ARM64::X19);
|
||||
Reserved.set(ARM64::W19);
|
||||
}
|
||||
|
||||
return Reserved;
|
||||
}
|
||||
|
||||
bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF,
|
||||
unsigned Reg) const {
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
|
||||
switch (Reg) {
|
||||
default:
|
||||
break;
|
||||
case ARM64::SP:
|
||||
case ARM64::XZR:
|
||||
case ARM64::WSP:
|
||||
case ARM64::WZR:
|
||||
return true;
|
||||
case ARM64::X18:
|
||||
case ARM64::W18:
|
||||
return STI->isTargetDarwin();
|
||||
case ARM64::FP:
|
||||
case ARM64::W29:
|
||||
return TFI->hasFP(MF) || STI->isTargetDarwin();
|
||||
case ARM64::W19:
|
||||
case ARM64::X19:
|
||||
return hasBasePointer(MF);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
const TargetRegisterClass *
|
||||
ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
|
||||
unsigned Kind) const {
|
||||
return &ARM64::GPR64RegClass;
|
||||
}
|
||||
|
||||
const TargetRegisterClass *
|
||||
ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
|
||||
if (RC == &ARM64::CCRRegClass)
|
||||
return NULL; // Can't copy CPSR.
|
||||
return RC;
|
||||
}
|
||||
|
||||
unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; }
|
||||
|
||||
bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
|
||||
// In the presence of variable sized objects, if the fixed stack size is
|
||||
// large enough that referencing from the FP won't result in things being
|
||||
// in range relatively often, we can use a base pointer to allow access
|
||||
// from the other direction like the SP normally works.
|
||||
if (MFI->hasVarSizedObjects()) {
|
||||
// Conservatively estimate whether the negative offset from the frame
|
||||
// pointer will be sufficient to reach. If a function has a smallish
|
||||
// frame, it's less likely to have lots of spills and callee saved
|
||||
// space, so it's all more likely to be within range of the frame pointer.
|
||||
// If it's wrong, we'll materialize the constant and still get to the
|
||||
// object; it's just suboptimal. Negative offsets use the unscaled
|
||||
// load/store instructions, which have a 9-bit signed immediate.
|
||||
if (MFI->getLocalFrameSize() < 256)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
|
||||
return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP;
|
||||
}
|
||||
|
||||
bool
|
||||
ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF)
|
||||
const {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
// ARM64FrameLowering::resolveFrameIndexReference() can always fall back
|
||||
// to the stack pointer, so only put the emergency spill slot next to the
|
||||
// FP when there's no better way to access it (SP or base pointer).
|
||||
return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
|
||||
}
|
||||
|
||||
bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF)
|
||||
const {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
// Only consider eliminating leaf frames.
|
||||
if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
|
||||
MFI->adjustsStack()))
|
||||
return true;
|
||||
return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
|
||||
}
|
||||
|
||||
/// needsFrameBaseReg - Returns true if the instruction's frame index
|
||||
/// reference would be better served by a base register other than FP
|
||||
/// or SP. Used by LocalStackFrameAllocation to determine which frame index
|
||||
/// references it should create new base registers for.
|
||||
bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
|
||||
int64_t Offset) const {
|
||||
for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
|
||||
assert(i < MI->getNumOperands() &&
|
||||
"Instr doesn't have FrameIndex operand!");
|
||||
|
||||
// It's the load/store FI references that cause issues, as it can be difficult
|
||||
// to materialize the offset if it won't fit in the literal field. Estimate
|
||||
// based on the size of the local frame and some conservative assumptions
|
||||
// about the rest of the stack frame (note, this is pre-regalloc, so
|
||||
// we don't know everything for certain yet) whether this offset is likely
|
||||
// to be out of range of the immediate. Return true if so.
|
||||
|
||||
// We only generate virtual base registers for loads and stores, so
|
||||
// return false for everything else.
|
||||
if (!MI->mayLoad() && !MI->mayStore())
|
||||
return false;
|
||||
|
||||
// Without a virtual base register, if the function has variable sized
|
||||
// objects, all fixed-size local references will be via the frame pointer.
|
||||
// Approximate the offset and see if it's legal for the instruction.
|
||||
// Note that the incoming offset is based on the SP value at function entry,
|
||||
// so it'll be negative.
|
||||
MachineFunction &MF = *MI->getParent()->getParent();
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
|
||||
// Estimate an offset from the frame pointer.
|
||||
// Conservatively assume all GPR callee-saved registers get pushed.
|
||||
// FP, LR, X19-X28, D8-D15. 64-bits each.
|
||||
int64_t FPOffset = Offset - 16 * 20;
|
||||
// Estimate an offset from the stack pointer.
|
||||
// The incoming offset is relating to the SP at the start of the function,
|
||||
// but when we access the local it'll be relative to the SP after local
|
||||
// allocation, so adjust our SP-relative offset by that allocation size.
|
||||
Offset += MFI->getLocalFrameSize();
|
||||
// Assume that we'll have at least some spill slots allocated.
|
||||
// FIXME: This is a total SWAG number. We should run some statistics
|
||||
// and pick a real one.
|
||||
Offset += 128; // 128 bytes of spill slots
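// Illustrative walk-through (hypothetical numbers, not from this commit):
// for a fixed object at an entry-SP offset of -24 in a function with a
// 512-byte local frame, the FP-relative estimate is -24 - 320 = -344 and
// the SP-relative estimate becomes -24 + 512 + 128 = 616; each estimate is
// then checked with isFrameOffsetLegal() below.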
|
||||
|
||||
// If there is a frame pointer, try using it.
|
||||
// The FP is only available if there is no dynamic realignment. We
|
||||
// don't know for sure yet whether we'll need that, so we guess based
|
||||
// on whether there are any local variables that would trigger it.
|
||||
if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
|
||||
return false;
|
||||
|
||||
// If we can reference via the stack pointer or base pointer, try that.
|
||||
// FIXME: This (and the code that resolves the references) can be improved
|
||||
// to only disallow SP relative references in the live range of
|
||||
// the VLA(s). In practice, it's unclear how much difference that
|
||||
// would make, but it may be worth doing.
|
||||
if (isFrameOffsetLegal(MI, Offset))
|
||||
return false;
|
||||
|
||||
// The offset likely isn't legal; we want to allocate a virtual base register.
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
|
||||
int64_t Offset) const {
|
||||
assert(Offset <= INT_MAX && "Offset too big to fit in int.");
|
||||
assert(MI && "Unable to get the legal offset for nil instruction.");
|
||||
int SaveOffset = Offset;
|
||||
return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal;
|
||||
}
|
||||
|
||||
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
|
||||
/// at the beginning of the basic block.
|
||||
void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
|
||||
unsigned BaseReg,
|
||||
int FrameIdx,
|
||||
int64_t Offset) const {
|
||||
MachineBasicBlock::iterator Ins = MBB->begin();
|
||||
DebugLoc DL; // Defaults to "unknown"
|
||||
if (Ins != MBB->end())
|
||||
DL = Ins->getDebugLoc();
|
||||
|
||||
const MCInstrDesc &MCID = TII->get(ARM64::ADDXri);
|
||||
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
|
||||
const MachineFunction &MF = *MBB->getParent();
|
||||
MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
|
||||
unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
|
||||
|
||||
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
|
||||
.addFrameIndex(FrameIdx)
|
||||
.addImm(Offset)
|
||||
.addImm(Shifter);
|
||||
}
|
||||
|
||||
void ARM64RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
|
||||
unsigned BaseReg,
|
||||
int64_t Offset) const {
|
||||
MachineInstr &MI = *I;
|
||||
int Off = Offset; // ARM64 doesn't need the general 64-bit offsets
|
||||
unsigned i = 0;
|
||||
|
||||
while (!MI.getOperand(i).isFI()) {
|
||||
++i;
|
||||
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
|
||||
}
|
||||
bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII);
|
||||
assert(Done && "Unable to resolve frame index!");
|
||||
(void)Done;
|
||||
}
|
||||
|
||||
void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
|
||||
int SPAdj, unsigned FIOperandNum,
|
||||
RegScavenger *RS) const {
|
||||
assert(SPAdj == 0 && "Unexpected");
|
||||
|
||||
MachineInstr &MI = *II;
|
||||
MachineBasicBlock &MBB = *MI.getParent();
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
const ARM64FrameLowering *TFI = static_cast<const ARM64FrameLowering *>(
|
||||
MF.getTarget().getFrameLowering());
|
||||
|
||||
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
|
||||
unsigned FrameReg;
|
||||
int Offset;
|
||||
|
||||
// Special handling of dbg_value, stackmap and patchpoint instructions.
|
||||
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
|
||||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
|
||||
Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
|
||||
/*PreferFP=*/true);
|
||||
Offset += MI.getOperand(FIOperandNum + 1).getImm();
|
||||
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
|
||||
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
|
||||
return;
|
||||
}
|
||||
|
||||
// Modify MI as necessary to handle as much of 'Offset' as possible
|
||||
Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
|
||||
if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
|
||||
return;
|
||||
|
||||
assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
|
||||
"Emergency spill slot is out of reach");
|
||||
|
||||
// If we get here, the immediate doesn't fit into the instruction. We folded
|
||||
// as much as possible above. Handle the rest, providing a register that is
|
||||
// SP+LargeImm.
|
||||
unsigned ScratchReg =
|
||||
MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass);
|
||||
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
|
||||
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
|
||||
unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
|
||||
MachineFunction &MF) const {
|
||||
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
|
||||
|
||||
switch (RC->getID()) {
|
||||
default:
|
||||
return 0;
|
||||
case ARM64::GPR32RegClassID:
|
||||
case ARM64::GPR32spRegClassID:
|
||||
case ARM64::GPR32allRegClassID:
|
||||
case ARM64::GPR64spRegClassID:
|
||||
case ARM64::GPR64allRegClassID:
|
||||
case ARM64::GPR64RegClassID:
|
||||
case ARM64::GPR32commonRegClassID:
|
||||
case ARM64::GPR64commonRegClassID:
|
||||
return 32 - 1 // XZR/SP
|
||||
- (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
|
||||
- STI->isTargetDarwin() // X18 reserved as platform register
|
||||
- hasBasePointer(MF); // X19
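// For example (an illustrative count, not from this commit): on Darwin with
// a frame pointer and a base pointer this evaluates to
// 32 - 1 - 1 - 1 - 1 = 28 allocatable general-purpose registers.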
|
||||
case ARM64::FPR8RegClassID:
|
||||
case ARM64::FPR16RegClassID:
|
||||
case ARM64::FPR32RegClassID:
|
||||
case ARM64::FPR64RegClassID:
|
||||
case ARM64::FPR128RegClassID:
|
||||
return 32;
|
||||
|
||||
case ARM64::DDRegClassID:
|
||||
case ARM64::DDDRegClassID:
|
||||
case ARM64::DDDDRegClassID:
|
||||
case ARM64::QQRegClassID:
|
||||
case ARM64::QQQRegClassID:
|
||||
case ARM64::QQQQRegClassID:
|
||||
return 32;
|
||||
|
||||
case ARM64::FPR128_loRegClassID:
|
||||
return 16;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace llvm
|
89
lib/Target/ARM64/ARM64RegisterInfo.h
Normal file
@ -0,0 +1,89 @@
|
||||
//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the ARM64 implementation of the MRegisterInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_ARM64REGISTERINFO_H
|
||||
#define LLVM_TARGET_ARM64REGISTERINFO_H
|
||||
|
||||
#define GET_REGINFO_HEADER
|
||||
#include "ARM64GenRegisterInfo.inc"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class ARM64InstrInfo;
|
||||
class ARM64Subtarget;
|
||||
class MachineFunction;
|
||||
class RegScavenger;
|
||||
class TargetRegisterClass;
|
||||
|
||||
struct ARM64RegisterInfo : public ARM64GenRegisterInfo {
|
||||
private:
|
||||
const ARM64InstrInfo *TII;
|
||||
const ARM64Subtarget *STI;
|
||||
|
||||
public:
|
||||
ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti);
|
||||
|
||||
/// Code Generation virtual methods...
|
||||
bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
|
||||
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
|
||||
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
|
||||
|
||||
// Calls involved in thread-local variable lookup save more registers than
|
||||
// normal calls, so they need a different mask to represent this.
|
||||
const uint32_t *getTLSCallPreservedMask() const;
|
||||
|
||||
/// getThisReturnPreservedMask - Returns a call preserved mask specific to the
|
||||
/// case that 'returned' is on an i64 first argument if the calling convention
|
||||
/// is one that can (partially) model this attribute with a preserved mask
|
||||
/// (i.e. it is a calling convention that uses the same register for the first
|
||||
/// i64 argument and an i64 return value)
|
||||
///
|
||||
/// Should return NULL in the case that the calling convention does not have
|
||||
/// this property
|
||||
const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
|
||||
|
||||
BitVector getReservedRegs(const MachineFunction &MF) const;
|
||||
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
|
||||
unsigned Kind = 0) const;
|
||||
const TargetRegisterClass *
|
||||
getCrossCopyRegClass(const TargetRegisterClass *RC) const;
|
||||
|
||||
bool requiresRegisterScavenging(const MachineFunction &MF) const;
|
||||
bool useFPForScavengingIndex(const MachineFunction &MF) const;
|
||||
bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
|
||||
|
||||
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const;
|
||||
bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const;
|
||||
void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
|
||||
int FrameIdx, int64_t Offset) const;
|
||||
void resolveFrameIndex(MachineBasicBlock::iterator I, unsigned BaseReg,
|
||||
int64_t Offset) const;
|
||||
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
|
||||
unsigned FIOperandNum,
|
||||
RegScavenger *RS = NULL) const;
|
||||
|
||||
bool cannotEliminateFrame(const MachineFunction &MF) const;
|
||||
bool requiresVirtualBaseRegisters(const MachineFunction &MF) const;
|
||||
bool hasBasePointer(const MachineFunction &MF) const;
|
||||
unsigned getBaseRegister() const;
|
||||
|
||||
// Debug information queries.
|
||||
unsigned getFrameRegister(const MachineFunction &MF) const;
|
||||
|
||||
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
|
||||
MachineFunction &MF) const;
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_TARGET_ARM64REGISTERINFO_H
|
561
lib/Target/ARM64/ARM64RegisterInfo.td
Normal file
@ -0,0 +1,561 @@
|
||||
//===- ARM64RegisterInfo.td - Describe the ARM64 Registers -*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
class ARM64Reg<bits<16> enc, string n, list<Register> subregs = [],
|
||||
list<string> altNames = []>
|
||||
: Register<n, altNames> {
|
||||
let HWEncoding = enc;
|
||||
let Namespace = "ARM64";
|
||||
let SubRegs = subregs;
|
||||
}
|
||||
|
||||
let Namespace = "ARM64" in {
|
||||
def sub_32 : SubRegIndex<32>;
|
||||
|
||||
def bsub : SubRegIndex<8>;
|
||||
def hsub : SubRegIndex<16>;
|
||||
def ssub : SubRegIndex<32>;
|
||||
def dsub : SubRegIndex<32>;
|
||||
def qhisub : SubRegIndex<64>;
|
||||
def qsub : SubRegIndex<64>;
|
||||
// Note: Code depends on these having consecutive numbers
|
||||
def dsub0 : SubRegIndex<64>;
|
||||
def dsub1 : SubRegIndex<64>;
|
||||
def dsub2 : SubRegIndex<64>;
|
||||
def dsub3 : SubRegIndex<64>;
|
||||
// Note: Code depends on these having consecutive numbers
|
||||
def qsub0 : SubRegIndex<128>;
|
||||
def qsub1 : SubRegIndex<128>;
|
||||
def qsub2 : SubRegIndex<128>;
|
||||
def qsub3 : SubRegIndex<128>;
|
||||
}
|
||||
|
||||
let Namespace = "ARM64" in {
|
||||
def vreg : RegAltNameIndex;
|
||||
def vlist1 : RegAltNameIndex;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Registers
|
||||
//===----------------------------------------------------------------------===//
|
||||
def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>;
|
||||
def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>;
|
||||
def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>;
|
||||
def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>;
|
||||
def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>;
|
||||
def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>;
|
||||
def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>;
|
||||
def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>;
|
||||
def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>;
|
||||
def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>;
|
||||
def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>;
|
||||
def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>;
|
||||
def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>;
|
||||
def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>;
|
||||
def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>;
|
||||
def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>;
|
||||
def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>;
|
||||
def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>;
|
||||
def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>;
|
||||
def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>;
|
||||
def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>;
|
||||
def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>;
|
||||
def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>;
|
||||
def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>;
|
||||
def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>;
|
||||
def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>;
|
||||
def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>;
|
||||
def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>;
|
||||
def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>;
|
||||
def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>;
|
||||
def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>;
|
||||
def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>;
|
||||
def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias<WSP>;
|
||||
|
||||
let SubRegIndices = [sub_32] in {
|
||||
def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;
|
||||
def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>;
|
||||
def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>;
|
||||
def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>;
|
||||
def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>;
|
||||
def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>;
|
||||
def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias<W6>;
|
||||
def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>;
|
||||
def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>;
|
||||
def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>;
|
||||
def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
|
||||
def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
|
||||
def X12 : ARM64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
|
||||
def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
|
||||
def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
|
||||
def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
|
||||
def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
|
||||
def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
|
||||
def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
|
||||
def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
|
||||
def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
|
||||
def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
|
||||
def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
|
||||
def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
|
||||
def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
|
||||
def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
|
||||
def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
|
||||
def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
|
||||
def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
|
||||
def FP : ARM64Reg<29, "fp", [W29]>, DwarfRegAlias<W29>;
|
||||
def LR : ARM64Reg<30, "lr", [W30]>, DwarfRegAlias<W30>;
|
||||
def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>;
|
||||
def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
|
||||
}
|
||||
|
||||
// Condition code register.
|
||||
def CPSR : ARM64Reg<0, "cpsr">;
|
||||
|
||||
// GPR register classes with the intersections of GPR32/GPR32sp and
|
||||
// GPR64/GPR64sp for use by the coalescer.
|
||||
def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> {
|
||||
let AltOrders = [(rotl GPR32common, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
def GPR64common : RegisterClass<"ARM64", [i64], 64,
|
||||
(add (sequence "X%u", 0, 28), FP, LR)> {
|
||||
let AltOrders = [(rotl GPR64common, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
// GPR register classes which exclude SP/WSP.
|
||||
def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> {
|
||||
let AltOrders = [(rotl GPR32, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> {
|
||||
let AltOrders = [(rotl GPR64, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
|
||||
// GPR register classes which include SP/WSP.
|
||||
def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> {
|
||||
let AltOrders = [(rotl GPR32sp, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> {
|
||||
let AltOrders = [(rotl GPR64sp, 8)];
|
||||
let AltOrderSelect = [{ return 1; }];
|
||||
}
|
||||
|
||||
// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
|
||||
// constraint used by any instructions; it is used as a common super-class.
|
||||
def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>;
|
||||
def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>;
|
||||
|
||||
// For tail calls, we can't use callee-saved registers, as they are restored
|
||||
// to the saved value before the tail call, which would clobber a call address.
|
||||
// This is for indirect tail calls to store the address of the destination.
|
||||
def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21,
|
||||
X22, X23, X24, X25, X26,
|
||||
X27, X28)>;
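// For example (illustrative, not from this commit): the target address of an
// indirect tail call may live in any of x0-x18 (e.g. "br x16"), but not in a
// callee-saved register such as x19, which is restored from its spill slot
// before the branch and would clobber the call address.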
|
||||
|
||||
// GPR register classes for the post-increment amount of vector load/store that
|
||||
// has alternate printing when Rm=31 and prints a constant immediate value
|
||||
// equal to the total number of bytes transferred.
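// For example (illustrative assembly, not part of this commit):
// "ld1 { v0.16b }, [x0], x2" post-increments by a register, while the Rm=31
// encoding is printed as "ld1 { v0.16b }, [x0], #16", i.e. the 16 bytes
// transferred.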
|
||||
def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand1">;
|
||||
def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand2">;
|
||||
def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand3">;
|
||||
def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand4">;
|
||||
def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand6">;
|
||||
def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand8">;
|
||||
def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand12">;
|
||||
def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand16">;
|
||||
def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand24">;
|
||||
def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand32">;
|
||||
def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand48">;
|
||||
def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand64">;
|
||||
|
||||
// Condition code regclass.
|
||||
def CCR : RegisterClass<"ARM64", [i32], 32, (add CPSR)> {
|
||||
let CopyCost = -1; // Don't allow copying of status registers.
|
||||
|
||||
// CCR is not allocatable.
|
||||
let isAllocatable = 0;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Floating Point Scalar Registers
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>;
|
||||
def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>;
|
||||
def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>;
|
||||
def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>;
|
||||
def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>;
|
||||
def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>;
|
||||
def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>;
|
||||
def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>;
|
||||
def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>;
|
||||
def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>;
|
||||
def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>;
|
||||
def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>;
|
||||
def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>;
|
||||
def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>;
|
||||
def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>;
|
||||
def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>;
|
||||
def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>;
|
||||
def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>;
|
||||
def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>;
|
||||
def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>;
|
||||
def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>;
|
||||
def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>;
|
||||
def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>;
|
||||
def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>;
|
||||
def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>;
|
||||
def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>;
|
||||
def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>;
|
||||
def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>;
|
||||
def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>;
|
||||
def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>;
|
||||
def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>;
|
||||
def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>;
|
||||
|
||||
let SubRegIndices = [bsub] in {
|
||||
def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>;
|
||||
def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>;
|
||||
def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>;
|
||||
def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>;
|
||||
def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>;
|
||||
def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>;
|
||||
def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>;
|
||||
def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>;
|
||||
def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>;
|
||||
def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>;
|
||||
def H10 : ARM64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
|
||||
def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
|
||||
def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
|
||||
def H13 : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
|
||||
def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
|
||||
def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
|
||||
def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
|
||||
def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
|
||||
def H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
|
||||
def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
|
||||
def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
|
||||
def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
|
||||
def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
|
||||
def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
|
||||
def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
|
||||
def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
|
||||
def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
|
||||
def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
|
||||
def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
|
||||
def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
|
||||
def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
|
||||
def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
|
||||
}
|
||||
|
||||
let SubRegIndices = [hsub] in {
|
||||
def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>;
|
||||
def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>;
|
||||
def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>;
|
||||
def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>;
|
||||
def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>;
|
||||
def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>;
|
||||
def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>;
|
||||
def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>;
|
||||
def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>;
|
||||
def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>;
|
||||
def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
|
||||
def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
|
||||
def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
|
||||
def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
|
||||
def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
|
||||
def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
|
||||
def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
|
||||
def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
|
||||
def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
|
||||
def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
|
||||
def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
|
||||
def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
|
||||
def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
|
||||
def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
|
||||
def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
|
||||
def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
|
||||
def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
|
||||
def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
|
||||
def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
|
||||
def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
|
||||
def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
|
||||
def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
|
||||
}
|
||||
|
||||
let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
|
||||
def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
|
||||
def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
|
||||
def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
|
||||
def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
|
||||
def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
|
||||
def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
|
||||
def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
|
||||
def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
|
||||
def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
|
||||
def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
|
||||
def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
|
||||
def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
|
||||
def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
|
||||
def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
|
||||
def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
|
||||
def D15 : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
|
||||
def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
|
||||
def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
|
||||
def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
|
||||
def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
|
||||
def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
|
||||
def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
|
||||
def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
|
||||
def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
|
||||
def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
|
||||
def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
|
||||
def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
|
||||
def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
|
||||
def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
|
||||
def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
|
||||
def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
|
||||
def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
|
||||
}
|
||||
|
||||
let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
|
||||
def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
|
||||
def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
|
||||
def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
|
||||
def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
|
||||
def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
|
||||
def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
|
||||
def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
|
||||
def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
|
||||
def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
|
||||
def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
|
||||
def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
|
||||
def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
|
||||
def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
|
||||
def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
|
||||
def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
|
||||
def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
|
||||
def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
|
||||
def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
|
||||
def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
|
||||
def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
|
||||
def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
|
||||
def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
|
||||
def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
|
||||
def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
|
||||
def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
|
||||
def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
|
||||
def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
|
||||
def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
|
||||
def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
|
||||
def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
|
||||
def Q30 : ARM64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
|
||||
def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
|
||||
}
|
||||
|
||||
def FPR8 : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> {
|
||||
let Size = 8;
|
||||
}
|
||||
def FPR16 : RegisterClass<"ARM64", [untyped], 16, (sequence "H%u", 0, 31)> {
|
||||
let Size = 16;
|
||||
}
|
||||
def FPR32 : RegisterClass<"ARM64", [f32, i32], 32, (sequence "S%u", 0, 31)>;
|
||||
def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
|
||||
v1i64],
|
||||
64, (sequence "D%u", 0, 31)>;
|
||||
// We don't (yet) have an f128 legal type, so don't use that here. We
|
||||
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
|
||||
// that here.
|
||||
def FPR128 : RegisterClass<"ARM64",
|
||||
[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
|
||||
128, (sequence "Q%u", 0, 31)>;
|
||||
|
||||
// The lower 16 vector registers. Some instructions can only take registers
|
||||
// in this range.
|
||||
def FPR128_lo : RegisterClass<"ARM64",
|
||||
[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
|
||||
128, (trunc FPR128, 16)>;
|
||||
|
||||
// Pairs, triples, and quads of 64-bit vector registers.
|
||||
def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
|
||||
def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
|
||||
[(rotl FPR64, 0), (rotl FPR64, 1),
|
||||
(rotl FPR64, 2)]>;
|
||||
def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
|
||||
[(rotl FPR64, 0), (rotl FPR64, 1),
|
||||
(rotl FPR64, 2), (rotl FPR64, 3)]>;
|
||||
def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> {
|
||||
let Size = 128;
|
||||
}
|
||||
def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> {
|
||||
let Size = 192;
|
||||
}
|
||||
def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> {
|
||||
let Size = 256;
|
||||
}
|
||||
|
||||
// Pairs, triples, and quads of 128-bit vector registers.
|
||||
def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
|
||||
def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
|
||||
[(rotl FPR128, 0), (rotl FPR128, 1),
|
||||
(rotl FPR128, 2)]>;
|
||||
def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
|
||||
[(rotl FPR128, 0), (rotl FPR128, 1),
|
||||
(rotl FPR128, 2), (rotl FPR128, 3)]>;
|
||||
def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> {
|
||||
let Size = 256;
|
||||
}
|
||||
def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> {
|
||||
let Size = 384;
|
||||
}
|
||||
def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> {
|
||||
let Size = 512;
|
||||
}
|
||||
|
||||
|
||||
// Vector operand versions of the FP registers. Alternate name printing and
|
||||
// assembler matching.
|
||||
def VectorRegAsmOperand : AsmOperandClass { let Name = "VectorReg"; }
|
||||
let ParserMatchClass = VectorRegAsmOperand in {
|
||||
def V64 : RegisterOperand<FPR64, "printVRegOperand">;
|
||||
def V128 : RegisterOperand<FPR128, "printVRegOperand">;
|
||||
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand">;
|
||||
}
|
||||
|
||||
class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
|
||||
: AsmOperandClass {
|
||||
let Name = "TypedVectorList" # count # "_" # lanes # kind;
|
||||
|
||||
let PredicateMethod
|
||||
= "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
|
||||
let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
|
||||
}
|
||||
|
||||
class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
|
||||
: RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
|
||||
# kind # "'>">;
|
||||
|
||||
multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
|
||||
// With implicit types (probably on instruction instead). E.g. { v0, v1 }
|
||||
def _64AsmOperand : AsmOperandClass {
|
||||
let Name = NAME # "64";
|
||||
let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
|
||||
let RenderMethod = "addVectorList64Operands<" # count # ">";
|
||||
}
|
||||
|
||||
def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
|
||||
}
|
||||
|
||||
def _128AsmOperand : AsmOperandClass {
|
||||
let Name = NAME # "128";
|
||||
let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
|
||||
let RenderMethod = "addVectorList128Operands<" # count # ">";
|
||||
}
|
||||
|
||||
def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
|
||||
}
|
||||
|
||||
// 64-bit register lists with explicit type.
|
||||
|
||||
// { v0.8b, v1.8b }
|
||||
def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
|
||||
def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.4h, v1.4h }
|
||||
def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
|
||||
def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.2s, v1.2s }
|
||||
def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
|
||||
def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.1d, v1.1d }
|
||||
def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
|
||||
def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
|
||||
}
|
||||
|
||||
// 128-bit register lists with explicit type
|
||||
|
||||
// { v0.16b, v1.16b }
|
||||
def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
|
||||
def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.8h, v1.8h }
|
||||
def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
|
||||
def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.4s, v1.4s }
|
||||
def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
|
||||
def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.2d, v1.2d }
|
||||
def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
|
||||
def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.b, v1.b }
|
||||
def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
|
||||
def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.h, v1.h }
|
||||
def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
|
||||
def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.s, v1.s }
|
||||
def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
|
||||
def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
|
||||
}
|
||||
|
||||
// { v0.d, v1.d }
|
||||
def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
|
||||
def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
|
||||
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
defm VecListOne : VectorList<1, FPR64, FPR128>;
|
||||
defm VecListTwo : VectorList<2, DD, QQ>;
|
||||
defm VecListThree : VectorList<3, DDD, QQQ>;
|
||||
defm VecListFour : VectorList<4, DDDD, QQQQ>;
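// Illustrative expansion (a sketch of what the multiclass generates): the
// defms above produce operands such as VecListTwo8b, matching a list like
// { v0.8b, v1.8b }, and VecListFour2d, matching { v0.2d, v1.2d, v2.2d, v3.2d },
// backed by the DD/QQ/... register tuple classes passed in.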
|
||||
|
||||
|
||||
// Register operand versions of the scalar FP registers.
|
||||
def FPR16Op : RegisterOperand<FPR16, "printOperand">;
|
||||
def FPR32Op : RegisterOperand<FPR32, "printOperand">;
|
||||
def FPR64Op : RegisterOperand<FPR64, "printOperand">;
|
||||
def FPR128Op : RegisterOperand<FPR128, "printOperand">;
|
852
lib/Target/ARM64/ARM64SchedCyclone.td
Normal file
@ -0,0 +1,852 @@
|
||||
//=- ARM64SchedCyclone.td - ARM64 Cyclone Scheduling Defs ----*- tablegen -*-=//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines the machine model for ARM64 Cyclone to support
|
||||
// instruction scheduling and other instruction cost heuristics.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def CycloneModel : SchedMachineModel {
|
||||
let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
|
||||
let MicroOpBufferSize = 192; // Based on the reorder buffer.
|
||||
let LoadLatency = 4; // Optimistic load latency.
|
||||
let MispredictPenalty = 16; // 14-19 cycles are typical.
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Define each kind of processor resource and number available on Cyclone.
|
||||
|
||||
// 4 integer pipes
|
||||
def CyUnitI : ProcResource<4> {
|
||||
let BufferSize = 48;
|
||||
}
|
||||
|
||||
// 2 branch units: I[0..1]
|
||||
def CyUnitB : ProcResource<2> {
|
||||
let Super = CyUnitI;
|
||||
let BufferSize = 24;
|
||||
}
|
||||
|
||||
// 1 indirect-branch unit: I[0]
|
||||
def CyUnitBR : ProcResource<1> {
|
||||
let Super = CyUnitB;
|
||||
}
|
||||
|
||||
// 2 shifter pipes: I[2..3]
|
||||
// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
|
||||
def CyUnitIS : ProcResource<2> {
|
||||
let Super = CyUnitI;
|
||||
let BufferSize = 24;
|
||||
}
|
||||
|
||||
// 1 mul pipe: I[0]
|
||||
def CyUnitIM : ProcResource<1> {
|
||||
let Super = CyUnitBR;
|
||||
let BufferSize = 32;
|
||||
}
|
||||
|
||||
// 1 div pipe: I[1]
|
||||
def CyUnitID : ProcResource<1> {
|
||||
let Super = CyUnitB;
|
||||
let BufferSize = 16;
|
||||
}
|
||||
|
||||
// 1 integer division unit. This is driven by the ID pipe, but only
|
||||
// consumes the pipe for one cycle at issue and another cycle at writeback.
|
||||
def CyUnitIntDiv : ProcResource<1>;
|
||||
|
||||
// 2 ld/st pipes.
|
||||
def CyUnitLS : ProcResource<2> {
|
||||
let BufferSize = 28;
|
||||
}
|
||||
|
||||
// 3 fp/vector pipes.
|
||||
def CyUnitV : ProcResource<3> {
|
||||
let BufferSize = 48;
|
||||
}
|
||||
// 2 fp/vector arithmetic and multiply pipes: V[0-1]
|
||||
def CyUnitVM : ProcResource<2> {
|
||||
let Super = CyUnitV;
|
||||
let BufferSize = 32;
|
||||
}
|
||||
// 1 fp/vector division/sqrt pipe: V[2]
|
||||
def CyUnitVD : ProcResource<1> {
|
||||
let Super = CyUnitV;
|
||||
let BufferSize = 16;
|
||||
}
|
||||
// 1 fp compare pipe: V[0]
|
||||
def CyUnitVC : ProcResource<1> {
|
||||
let Super = CyUnitVM;
|
||||
let BufferSize = 16;
|
||||
}
|
||||
|
||||
// 2 fp division/square-root units. These are driven by the VD pipe,
|
||||
// but only consume the pipe for one cycle at issue and a cycle at writeback.
|
||||
def CyUnitFloatDiv : ProcResource<2>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Define scheduler read/write resources and latency on Cyclone.
|
||||
// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
|
||||
|
||||
let SchedModel = CycloneModel in {
|
||||
|
||||
//---
|
||||
// 7.8.1. Moves
|
||||
//---
|
||||
|
||||
// A single nop micro-op (uX).
|
||||
def WriteX : SchedWriteRes<[]> { let Latency = 0; }
|
||||
|
||||
// Move zero is a register rename (to machine register zero).
|
||||
// The move is replaced by a single nop micro-op.
|
||||
// MOVZ Rd, #0
|
||||
// AND Rd, Rzr, #imm
|
||||
def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
|
||||
def WriteImmZ : SchedWriteVariant<[
|
||||
SchedVar<WriteZPred, [WriteX]>,
|
||||
SchedVar<NoSchedPred, [WriteImm]>]>;
|
||||
def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
|
||||
|
||||
// Move GPR is a register rename and single nop micro-op.
|
||||
// ORR Xd, XZR, Xm
|
||||
// ADD Xd, Xn, #0
|
||||
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
|
||||
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
|
||||
def WriteMov : SchedWriteVariant<[
|
||||
SchedVar<WriteIMovPred, [WriteX]>,
|
||||
SchedVar<WriteVMovPred, [WriteX]>,
|
||||
SchedVar<NoSchedPred, [WriteI]>]>;
|
||||
def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
|
||||
|
||||
// Move non-zero immediate is an integer ALU op.
|
||||
// MOVN,MOVZ,MOVK
|
||||
def : WriteRes<WriteImm, [CyUnitI]>;
|
||||
|
||||
//---
|
||||
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
|
||||
// Shifts and Bitfield Operations
|
||||
//---
|
||||
|
||||
// ADR,ADRP
|
||||
// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
|
||||
// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
|
||||
// ADC(S),SBC(S)
|
||||
// Aliases: CMN, CMP, TST
|
||||
//
|
||||
// Conditional operations.
|
||||
// CCMNi,CCMPi,CCMNr,CCMPr,
|
||||
// CSEL,CSINC,CSINV,CSNEG
|
||||
//
|
||||
// Bit counting and reversal operations.
|
||||
// CLS,CLZ,RBIT,REV,REV16,REV32
|
||||
def : WriteRes<WriteI, [CyUnitI]>;
|
||||
|
||||
// ADD with shifted register operand is a single micro-op that
|
||||
// consumes a shift pipeline for two cycles.
|
||||
// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
|
||||
// EXAMPLE: ADDrs Xn, Xm LSL #imm
|
||||
def : WriteRes<WriteISReg, [CyUnitIS]> {
|
||||
let Latency = 2;
|
||||
let ResourceCycles = [2];
|
||||
}
|
||||
|
||||
// ADD with extended register operand is the same as shifted reg operand.
|
||||
// ADD(S)re,SUB(S)re
|
||||
// EXAMPLE: ADDXre Xn, Xm, UXTB #1
|
||||
def : WriteRes<WriteIEReg, [CyUnitIS]> {
|
||||
let Latency = 2;
|
||||
let ResourceCycles = [2];
|
||||
}
|
||||
|
||||
// Variable shift and bitfield operations.
|
||||
// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
|
||||
def : WriteRes<WriteIS, [CyUnitIS]>;
|
||||
|
||||
// EXTR shifts a pair of registers and requires two micro-ops.
|
||||
// The second micro-op is delayed, as modeled by ReadExtrHi.
|
||||
// EXTR Xn, Xm, #imm
|
||||
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
|
||||
let Latency = 2;
|
||||
let NumMicroOps = 2;
|
||||
}
|
||||
|
||||
// EXTR's first register read is delayed by one cycle, effectively
|
||||
// shortening its writer's latency.
|
||||
// EXTR Xn, Xm, #imm
|
||||
def : ReadAdvance<ReadExtrHi, 1>;
|
||||
|
||||
//---
|
||||
// 7.8.6. Multiplies
|
||||
//---
|
||||
|
||||
// MUL/MNEG are aliases for MADD/MSUB.
|
||||
// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
|
||||
def : WriteRes<WriteIM32, [CyUnitIM]> {
|
||||
let Latency = 4;
|
||||
}
|
||||
// MADDX,MSUBX,SMULH,UMULH
|
||||
def : WriteRes<WriteIM64, [CyUnitIM]> {
|
||||
let Latency = 5;
|
||||
}
|
||||
|
||||
//---
|
||||
// 7.8.7. Divide
|
||||
//---
|
||||
|
||||
// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
|
||||
// The ID pipe is consumed for 2 cycles: issue and writeback.
|
||||
// SDIVW,UDIVW
|
||||
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
|
||||
let Latency = 10;
|
||||
let ResourceCycles = [2, 10];
|
||||
}
|
||||
// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
|
||||
// The ID pipe is consumed for 2 cycles: issue and writeback.
|
||||
// SDIVX,UDIVX
|
||||
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
|
||||
let Latency = 13;
|
||||
let ResourceCycles = [2, 13];
|
||||
}
|
||||
|
||||
//---
|
||||
// 7.8.8,7.8.10. Load/Store, single element
|
||||
//---
|
||||
|
||||
// Integer loads take 4 cycles and use one LS unit for one cycle.
|
||||
def : WriteRes<WriteLD, [CyUnitLS]> {
|
||||
let Latency = 4;
|
||||
}
|
||||
|
||||
// Store-load forwarding is 4 cycles.
|
||||
//
|
||||
// Note: The store-exclusive sequence incorporates this
|
||||
// latency. However, general heuristics should not model the
|
||||
// dependence between a store and subsequent may-alias load because
|
||||
// hardware speculation works.
|
||||
def : WriteRes<WriteST, [CyUnitLS]> {
|
||||
let Latency = 4;
|
||||
}
|
||||
|
||||
// Load from base address plus an optionally scaled register offset.
|
||||
// Rt latency is the WriteIS + WriteLD latency.
|
||||
// EXAMPLE: LDR Xn, Xm [, lsl 3]
|
||||
def CyWriteLDIdx : SchedWriteVariant<[
|
||||
SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
|
||||
SchedVar<NoSchedPred, [WriteLD]>]>; // Load from register offset.
|
||||
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>; // Map ARM64->Cyclone type.
|
||||
|
||||
// EXAMPLE: STR Xn, Xm [, lsl 3]
|
||||
def CyWriteSTIdx : SchedWriteVariant<[
|
||||
SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
|
||||
SchedVar<NoSchedPred, [WriteST]>]>; // Store to register offset.
|
||||
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>; // Map ARM64->Cyclone type.
|
||||
|
||||
// Read the (unshifted) base register Xn in the second micro-op one cycle later.
|
||||
// EXAMPLE: LDR Xn, Xm [, lsl 3]
|
||||
def ReadBaseRS : SchedReadAdvance<1>;
|
||||
def CyReadAdrBase : SchedReadVariant<[
|
||||
SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
|
||||
SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift.
|
||||
def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map ARM64->Cyclone type.
|
||||
|
||||
//---
|
||||
// 7.8.9,7.8.11. Load/Store, paired
|
||||
//---
|
||||
|
||||
// Address pre/post increment is a simple ALU op with one cycle latency.
|
||||
def : WriteRes<WriteAdr, [CyUnitI]>;
|
||||
|
||||
// LDP high register write is fused with the load, but a nop micro-op remains.
|
||||
def : WriteRes<WriteLDHi, []> {
|
||||
let Latency = 4;
|
||||
}
|
||||
|
||||
// STP is a vector op and store, except for QQ, which is just two stores.
|
||||
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
|
||||
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
|
||||
|
||||
//---
|
||||
// 7.8.13. Branches
|
||||
//---
|
||||
|
||||
// Branches take a single micro-op.
|
||||
// The misprediction penalty is defined as a SchedMachineModel property.
|
||||
def : WriteRes<WriteBr, [CyUnitB]> {let Latency = 0;}
|
||||
def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
|
||||
|
||||
//---
|
||||
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
|
||||
//---
|
||||
|
||||
// NOP,SEV,SEVL,WFE,WFI,YIELD
|
||||
def : WriteRes<WriteHint, []> {let Latency = 0;}
|
||||
// ISB
|
||||
def : InstRW<[WriteI], (instrs ISB)>;
|
||||
// CLREX,DMB,DSB
|
||||
def : WriteRes<WriteBarrier, [CyUnitLS]>;
|
||||
|
||||
// System instructions get an invalid latency because the latency of
|
||||
// other operations across them is meaningless.
|
||||
def : WriteRes<WriteSys, []> {let Latency = -1;}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// 7.9 Vector Unit Instructions
|
||||
|
||||
// Simple vector operations take 2 cycles.
|
||||
def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
|
||||
|
||||
// Define some longer latency vector op types for Cyclone.
|
||||
def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
|
||||
def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
|
||||
def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
|
||||
def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
|
||||
|
||||
// Simple floating-point operations take 2 cycles.
|
||||
def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
|
||||
|
||||
//---
|
||||
// 7.9.1 Vector Moves
|
||||
//---
|
||||
|
||||
// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
|
||||
// generates expensive int-float conversion instead:
|
||||
// FMOVDi Dd, #0.0
|
||||
// FMOVv2f64ns Vd.2d, #0.0
|
||||
|
||||
// FMOVSi,FMOVDi
|
||||
def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
|
||||
|
||||
// MOVI,MVNI are WriteV
|
||||
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
|
||||
|
||||
// Move FPR is a register rename and single nop micro-op.
|
||||
// ORR.16b Vd,Vn,Vn
|
||||
// COPY is handled above in the WriteMov Variant.
|
||||
def WriteVMov : SchedWriteVariant<[
|
||||
SchedVar<WriteVMovPred, [WriteX]>,
|
||||
SchedVar<NoSchedPred, [WriteV]>]>;
|
||||
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
|
||||
|
||||
// FMOVSr,FMOVDr are WriteF.
|
||||
|
||||
// MOV V,V is a WriteV.
|
||||
|
||||
// CPY D,V[x] is a WriteV
|
||||
|
||||
// INS V[x],V[y] is a WriteV.
|
||||
|
||||
// FMOVWSr,FMOVXDr,FMOVXDHighr
|
||||
def : SchedAlias<WriteFCopy, WriteVLD>;
|
||||
|
||||
// FMOVSWr,FMOVDXr
|
||||
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
|
||||
|
||||
// INS V[x],R
|
||||
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
|
||||
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
|
||||
|
||||
// SMOV,UMOV R,V[x]
|
||||
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
|
||||
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
|
||||
|
||||
// DUP V,R
|
||||
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
|
||||
|
||||
// DUP V,V[x] is a WriteV.
|
||||
|
||||
//---
|
||||
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
|
||||
//---
|
||||
|
||||
// BIC,ORR V,#imm are WriteV
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "ABSv")>;
|
||||
|
||||
// MVN,NEG,NOT are WriteV
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
|
||||
|
||||
// ADDP is a WriteV.
|
||||
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
|
||||
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
|
||||
|
||||
def : InstRW<[CyWriteV3],
|
||||
(instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
|
||||
|
||||
// ADD,SUB are WriteV
|
||||
|
||||
// Forward declare.
|
||||
def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
|
||||
|
||||
// Add/Diff and accumulate uses the vector multiply unit.
|
||||
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
|
||||
def CyReadVAccum : SchedReadAdvance<1,
|
||||
[CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
|
||||
|
||||
def : InstRW<[CyWriteVAccum, CyReadVAccum],
|
||||
(instregex "SADALP","UADALP")>;
|
||||
|
||||
def : InstRW<[CyWriteVAccum, CyReadVAccum],
|
||||
(instregex "SABAv","UABAv","SABALv","UABALv")>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
|
||||
|
||||
def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
|
||||
|
||||
// WriteV includes:
|
||||
// AND,BIC,CMTST,EOR,ORN,ORR
|
||||
// ADDP
|
||||
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
|
||||
// SADDL,SSUBL,UADDL,USUBL
|
||||
// SADDW,SSUBW,UADDW,USUBW
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
|
||||
"CMLEv","CMLTv",
|
||||
"CMHIv","CMHSv")>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
|
||||
"SMAXPv","SMINPv","UMAXPv","UMINPv")>;
|
||||
|
||||
def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
|
||||
"SABDLv","UABDLv")>;
|
||||
|
||||
//---
|
||||
// 7.9.3 Floating Point Arithmetic and Comparisons
|
||||
//---
|
||||
|
||||
// FABS,FNEG are WriteF
|
||||
|
||||
def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
|
||||
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
|
||||
"FMINPv2i","FMINNMPv2i")>;
|
||||
|
||||
def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
|
||||
|
||||
def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
|
||||
FSUBSrr,FSUBv2f32,FSUBv4f32,
|
||||
FADDPv2f32,FADDPv4f32,
|
||||
FABD32,FABDv2f32,FABDv4f32)>;
|
||||
def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
|
||||
FSUBDrr,FSUBv2f64,
|
||||
FADDPv2f64,
|
||||
FABD64,FABDv2f64)>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
|
||||
"FMAXS","FMAXD","FMAXv",
|
||||
"FMINS","FMIND","FMINv",
|
||||
"FMAXNMS","FMAXNMD","FMAXNMv",
|
||||
"FMINNMS","FMINNMD","FMINNMv",
|
||||
"FMAXPv2f","FMAXPv4f",
|
||||
"FMINPv2f","FMINPv4f",
|
||||
"FMAXNMPv2f","FMAXNMPv4f",
|
||||
"FMINNMPv2f","FMINNMPv4f")>;
|
||||
|
||||
// FCMP,FCMPE,FCCMP,FCCMPE
|
||||
def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
|
||||
|
||||
// FCSEL is a WriteF.
|
||||
|
||||
//---
|
||||
// 7.9.4 Shifts and Bitfield Operations
|
||||
//---
|
||||
|
||||
// SHL is a WriteV
|
||||
|
||||
def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
|
||||
def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
|
||||
|
||||
def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
|
||||
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
|
||||
|
||||
// Shift and accumulate uses the vector multiply unit.
|
||||
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
|
||||
def CyReadVShiftAcc : SchedReadAdvance<1,
|
||||
[CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
|
||||
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
|
||||
(instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
|
||||
|
||||
// SSHL,USHL are WriteV.
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
|
||||
|
||||
// SQSHL,SQSHLU,UQSHL are WriteV.
|
||||
|
||||
def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
|
||||
|
||||
// WriteV includes:
|
||||
// SHLL,SSHLL,USHLL
|
||||
// SLI,SRI
|
||||
// BIF,BIT,BSL
|
||||
// EXT
|
||||
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
|
||||
// XTN2
|
||||
|
||||
def : InstRW<[CyWriteV4],
|
||||
(instregex "RSHRNv","SHRNv",
|
||||
"SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
|
||||
"UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
|
||||
|
||||
//---
|
||||
// 7.9.5 Multiplication
|
||||
//---
|
||||
|
||||
def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
|
||||
def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
|
||||
"SQDMULLv","SQDMULHv","SQRDMULHv")>;
|
||||
|
||||
// FMUL,FMULX,FNMUL default to WriteFMul.
|
||||
def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
|
||||
|
||||
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
|
||||
def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
|
||||
FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
|
||||
|
||||
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
|
||||
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
|
||||
(instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
|
||||
"SQDMLAL","SQDMLSL")>;
|
||||
|
||||
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
|
||||
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
|
||||
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
|
||||
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
|
||||
|
||||
def : InstRW<[CyWriteSMul, CyReadSMul],
|
||||
(instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
|
||||
FMLAv2f32,FMLAv4f32,
|
||||
FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
|
||||
def : InstRW<[CyWriteDMul, CyReadDMul],
|
||||
(instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
|
||||
FMLAv2f64,FMLAv2i64_indexed,
|
||||
FMLSv2f64,FMLSv2i64_indexed)>;
|
||||
|
||||
def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
|
||||
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
|
||||
|
||||
//---
|
||||
// 7.9.6 Divide and Square Root
|
||||
//---
|
||||
|
||||
// FDIV,FSQRT
|
||||
// TODO: Add 64-bit variant with 19 cycle latency.
|
||||
// TODO: Specialize FSQRT for longer latency.
|
||||
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
|
||||
let Latency = 17;
|
||||
let ResourceCycles = [2, 17];
|
||||
}
|
||||
|
||||
def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
|
||||
|
||||
def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
|
||||
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
|
||||
|
||||
def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
|
||||
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
|
||||
def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>;
|
||||
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
|
||||
|
||||
//---
|
||||
// 7.9.7 Integer-FP Conversions
|
||||
//---
|
||||
|
||||
// FCVT lengthen f16/s32
|
||||
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
|
||||
|
||||
// FCVT,FCVTN,FCVTXN
|
||||
// SCVTF,UCVTF V,V
|
||||
// FRINT(AIMNPXZ) V,V
|
||||
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
|
||||
|
||||
// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
|
||||
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
|
||||
def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
|
||||
|
||||
// FCVT Rd, S/D = V6+LD4: 10 cycles
|
||||
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
|
||||
def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
|
||||
|
||||
// FCVTL is a WriteV
|
||||
|
||||
//---
|
||||
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
|
||||
//---
|
||||
|
||||
def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
|
||||
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
|
||||
AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
|
||||
SHA1SU0rrr)>;
|
||||
|
||||
def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
|
||||
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
|
||||
|
||||
def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
|
||||
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
|
||||
SHA256Hrrr,SHA256H2rrr)>;
|
||||
|
||||
// TRN,UZP,ZIP are WriteV.
|
||||
|
||||
// TBL,TBX are WriteV.
|
||||
|
||||
//---
|
||||
// 7.9.11-7.9.14 Load/Store, single element and paired
|
||||
//---
|
||||
|
||||
// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
|
||||
def : WriteRes<WriteVLD, [CyUnitLS]> {
|
||||
let Latency = 5;
|
||||
}
|
||||
|
||||
// Store-load forwarding is 4 cycles.
|
||||
def : WriteRes<WriteVST, [CyUnitLS]> {
|
||||
let Latency = 4;
|
||||
}
|
||||
|
||||
// WriteVLDPair/VSTPair sequences are expanded by the target description.
|
||||
|
||||
//---
|
||||
// 7.9.15 Load, element operations
|
||||
//---
|
||||
|
||||
// Only the first WriteVLD and the WriteAdr for writeback match def operands.
|
||||
// Subsequent WriteVLDs consume resources. Since all loaded values have the
|
||||
// same latency, this is acceptable.
|
||||
|
||||
// Vd is read 5 cycles after issuing the vector load.
|
||||
def : ReadAdvance<ReadVLD, 5>;
|
||||
|
||||
def : InstRW<[WriteVLD],
|
||||
(instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr],
|
||||
(instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
|
||||
|
||||
// Register writes from the load's high half are fused micro-ops.
|
||||
def : InstRW<[WriteVLD],
|
||||
(instregex "LD1Twov(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr],
|
||||
(instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVLD, WriteVLD],
|
||||
(instregex "LD1Twov(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
|
||||
(instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLD, WriteVLD],
|
||||
(instregex "LD1Threev(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
|
||||
(instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
|
||||
(instregex "LD1Threev(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
|
||||
(instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLD, WriteVLD],
|
||||
(instregex "LD1Fourv(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
|
||||
(instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
|
||||
(instregex "LD1Fourv(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
|
||||
(instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD],
|
||||
(instregex "LD1i(8|16|32)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
|
||||
(instregex "LD1i(8|16|32)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle],
|
||||
(instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr],
|
||||
(instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteV],
|
||||
(instregex "LD2Twov(8b|4h|2s)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
|
||||
(instregex "LD2Twov(8b|4h|2s)_POST$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
|
||||
(instregex "LD2Twov(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
|
||||
(instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
|
||||
(instregex "LD2i(8|16|32)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
|
||||
(instregex "LD2i(8|16|32)_POST")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
|
||||
(instregex "LD2i64$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
|
||||
(instregex "LD2i64_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteV],
|
||||
(instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
|
||||
(instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
|
||||
(instregex "LD3Threev(8b|4h|2s)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
|
||||
(instregex "LD3Threev(8b|4h|2s)_POST")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
|
||||
(instregex "LD3Threev(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
|
||||
(instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
|
||||
(instregex "LD3i(8|16|32)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
|
||||
(instregex "LD3i(8|16|32)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
|
||||
(instregex "LD3i64$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
|
||||
(instregex "LD3i64_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
|
||||
(instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
|
||||
(instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
|
||||
(instrs LD3Rv1d,LD3Rv2d)>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
|
||||
(instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
|
||||
(instregex "LD4Fourv(8b|4h|2s)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
|
||||
(instregex "LD4Fourv(8b|4h|2s)_POST")>;
|
||||
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
|
||||
WriteVLDPairShuffle, WriteVLDPairShuffle],
|
||||
(instregex "LD4Fourv(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
|
||||
WriteVLDPairShuffle, WriteVLDPairShuffle],
|
||||
(instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
|
||||
(instregex "LD4i(8|16|32)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
|
||||
(instregex "LD4i(8|16|32)_POST")>;
|
||||
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
|
||||
(instrs LD4i64)>;
|
||||
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
|
||||
(instrs LD4i64_POST)>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
|
||||
(instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
|
||||
(instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
|
||||
(instrs LD4Rv1d,LD4Rv2d)>;
|
||||
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
|
||||
(instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
|
||||
|
||||
//---
|
||||
// 7.9.16 Store, element operations
|
||||
//---
|
||||
|
||||
// Only the WriteAdr for writeback matches a def operand.
|
||||
// Subsequent WriteVSTs only consume resources.
|
||||
|
||||
def : InstRW<[WriteVST],
|
||||
(instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVST],
|
||||
(instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle],
|
||||
(instregex "ST1Twov(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle],
|
||||
(instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVST, WriteVST],
|
||||
(instregex "ST1Twov(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVST, WriteVST],
|
||||
(instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle, WriteVST],
|
||||
(instregex "ST1Threev(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
|
||||
(instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVST, WriteVST, WriteVST],
|
||||
(instregex "ST1Threev(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
|
||||
(instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST1Fourv(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
|
||||
(instregex "ST1Fourv(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
|
||||
(instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle],
|
||||
(instregex "ST2Twov(8b|4h|2s)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle],
|
||||
(instregex "ST2Twov(8b|4h|2s)_POST")>;
|
||||
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST2Twov(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
|
||||
def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST3Threev(8b|4h|2s)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST3Threev(8b|4h|2s)_POST")>;
|
||||
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST3Threev(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
|
||||
(instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
|
||||
|
||||
def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>;
|
||||
def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
|
||||
|
||||
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
|
||||
(instregex "ST4Fourv(8b|4h|2s|1d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
|
||||
(instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
|
||||
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
|
||||
WriteVSTPairShuffle, WriteVSTPairShuffle],
|
||||
(instregex "ST4Fourv(16b|8h|4s|2d)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
|
||||
WriteVSTPairShuffle, WriteVSTPairShuffle],
|
||||
(instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>;
|
||||
def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
|
||||
|
||||
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
|
||||
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
|
||||
|
||||
} // SchedModel = CycloneModel
|
lib/Target/ARM64/ARM64Schedule.td (new file, 92 lines)
@ -0,0 +1,92 @@
|
||||
//===-- ARM64Schedule.td - ARM64 Scheduling Definitions ----*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Define TII for use in SchedVariant Predicates.
|
||||
// const MachineInstr *MI and const TargetSchedModel *SchedModel
|
||||
// are defined by default.
|
||||
def : PredicateProlog<[{
|
||||
const ARM64InstrInfo *TII =
|
||||
static_cast<const ARM64InstrInfo*>(SchedModel->getInstrInfo());
|
||||
(void)TII;
|
||||
}]>;
|
||||
|
||||
// ARM64 Scheduler Definitions
|
||||
|
||||
def WriteImm : SchedWrite; // MOVN, MOVZ
|
||||
// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
|
||||
// select the correct sequence of WriteImms.
|
||||
|
||||
def WriteI : SchedWrite; // ALU
|
||||
def WriteISReg : SchedWrite; // ALU of Shifted-Reg
|
||||
def WriteIEReg : SchedWrite; // ALU of Extended-Reg
|
||||
def WriteExtr : SchedWrite; // EXTR shifts a reg pair
|
||||
def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
|
||||
def WriteIS : SchedWrite; // Shift/Scale
|
||||
def WriteID32 : SchedWrite; // 32-bit Divide
|
||||
def WriteID64 : SchedWrite; // 64-bit Divide
|
||||
def WriteIM32 : SchedWrite; // 32-bit Multiply
|
||||
def WriteIM64 : SchedWrite; // 64-bit Multiply
|
||||
def WriteBr : SchedWrite; // Branch
|
||||
def WriteBrReg : SchedWrite; // Indirect Branch
|
||||
|
||||
def WriteLD : SchedWrite; // Load from base addr plus immediate offset
|
||||
def WriteST : SchedWrite; // Store to base addr plus immediate offset
|
||||
def WriteSTP : SchedWrite; // Store a register pair.
|
||||
def WriteAdr : SchedWrite; // Address pre/post increment.
|
||||
|
||||
def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
|
||||
def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
|
||||
def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
|
||||
|
||||
// ScaledIdxPred is true if a WriteLDIdx operand will be
|
||||
// scaled. Subtargets can use this to dynamically select resources and
|
||||
// latency for WriteLDIdx and ReadAdrBase.
|
||||
def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
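// A minimal sketch of how a subtarget might use this predicate (the write
// names, resource, and latencies below are illustrative, not defined here):
//   def MyLDIdxScaled   : SchedWriteRes<[MyUnitLS]> { let Latency = 5; }
//   def MyLDIdxUnscaled : SchedWriteRes<[MyUnitLS]> { let Latency = 4; }
//   def MyWriteLDIdx    : SchedWriteVariant<[
//     SchedVar<ScaledIdxPred, [MyLDIdxScaled]>,
//     SchedVar<NoSchedPred,   [MyLDIdxUnscaled]>]>;
//   def : SchedAlias<WriteLDIdx, MyWriteLDIdx>;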
|
||||
|
||||
// Serialized two-level address load.
|
||||
// EXAMPLE: LOADGot
|
||||
def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
|
||||
|
||||
// Serialized two-level address lookup.
|
||||
// EXAMPLE: MOVaddr...
|
||||
def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
|
||||
|
||||
// The second register of a load-pair.
|
||||
// LDP,LDPSW,LDNP,LDXP,LDAXP
|
||||
def WriteLDHi : SchedWrite;
|
||||
|
||||
// Store-exclusive is a store followed by a dependent load.
|
||||
def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
|
||||
|
||||
def WriteSys : SchedWrite; // Long, variable latency system ops.
|
||||
def WriteBarrier : SchedWrite; // Memory barrier.
|
||||
def WriteHint : SchedWrite; // Hint instruction.
|
||||
|
||||
def WriteF : SchedWrite; // General floating-point ops.
|
||||
def WriteFCmp : SchedWrite; // Floating-point compare.
|
||||
def WriteFCvt : SchedWrite; // Float conversion.
|
||||
def WriteFCopy : SchedWrite; // Float-int register copy.
|
||||
def WriteFImm : SchedWrite; // Floating-point immediate.
|
||||
def WriteFMul : SchedWrite; // Floating-point multiply.
|
||||
def WriteFDiv : SchedWrite; // Floating-point division.
|
||||
|
||||
def WriteV : SchedWrite; // Vector ops.
|
||||
def WriteVLD : SchedWrite; // Vector loads.
|
||||
def WriteVST : SchedWrite; // Vector stores.
|
||||
|
||||
// Read the unwritten lanes of the VLD's destination registers.
|
||||
def ReadVLD : SchedRead;
|
||||
|
||||
// Sequential vector load and shuffle.
|
||||
def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
|
||||
def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
|
||||
|
||||
// Store a shuffled vector.
|
||||
def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
|
||||
def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
|
lib/Target/ARM64/ARM64SelectionDAGInfo.cpp (new file, 57 lines)
@ -0,0 +1,57 @@
|
||||
//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64SelectionDAGInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-selectiondag-info"
|
||||
#include "ARM64TargetMachine.h"
|
||||
using namespace llvm;
|
||||
|
||||
ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM)
|
||||
: TargetSelectionDAGInfo(TM),
|
||||
Subtarget(&TM.getSubtarget<ARM64Subtarget>()) {}
|
||||
|
||||
ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {}
|
||||
|
||||
SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset(
|
||||
SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
|
||||
SDValue Size, unsigned Align, bool isVolatile,
|
||||
MachinePointerInfo DstPtrInfo) const {
|
||||
// Check to see if there is a specialized entry-point for memory zeroing.
|
||||
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
|
||||
ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
|
||||
const char *bzeroEntry =
|
||||
(V && V->isNullValue()) ? Subtarget->getBZeroEntry() : 0;
|
||||
// For small size (< 256), it is not beneficial to use bzero
|
||||
// instead of memset.
|
||||
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
|
||||
const ARM64TargetLowering &TLI = *static_cast<const ARM64TargetLowering *>(
|
||||
DAG.getTarget().getTargetLowering());
|
||||
|
||||
EVT IntPtr = TLI.getPointerTy();
|
||||
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
|
||||
TargetLowering::ArgListTy Args;
|
||||
TargetLowering::ArgListEntry Entry;
|
||||
Entry.Node = Dst;
|
||||
Entry.Ty = IntPtrTy;
|
||||
Args.push_back(Entry);
|
||||
Entry.Node = Size;
|
||||
Args.push_back(Entry);
|
||||
TargetLowering::CallLoweringInfo CLI(
|
||||
Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
|
||||
0, CallingConv::C, /*isTailCall=*/false,
|
||||
/*doesNotRet=*/false, /*isReturnValueUsed=*/false,
|
||||
DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
|
||||
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
|
||||
return CallResult.second;
|
||||
}
|
||||
return SDValue();
|
||||
}
|
lib/Target/ARM64/ARM64SelectionDAGInfo.h (new file, 38 lines)
@ -0,0 +1,38 @@
|
||||
//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines the ARM64 subclass for TargetSelectionDAGInfo.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64SELECTIONDAGINFO_H
|
||||
#define ARM64SELECTIONDAGINFO_H
|
||||
|
||||
#include "llvm/Target/TargetSelectionDAGInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo {
|
||||
/// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
|
||||
/// make the right decision when generating code for different targets.
|
||||
const ARM64Subtarget *Subtarget;
|
||||
|
||||
public:
|
||||
explicit ARM64SelectionDAGInfo(const TargetMachine &TM);
|
||||
~ARM64SelectionDAGInfo();
|
||||
|
||||
virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
|
||||
SDValue Chain, SDValue Dst,
|
||||
SDValue Src, SDValue Size,
|
||||
unsigned Align, bool isVolatile,
|
||||
MachinePointerInfo DstPtrInfo) const;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
lib/Target/ARM64/ARM64StorePairSuppress.cpp (new file, 169 lines)
@ -0,0 +1,169 @@
|
||||
//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This pass identifies floating point stores that should not be combined into
|
||||
// store pairs. Later we may do the same for floating point loads.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64-stp-suppress"
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineTraceMetrics.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/CodeGen/TargetSchedule.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
class ARM64StorePairSuppress : public MachineFunctionPass {
|
||||
const ARM64InstrInfo *TII;
|
||||
const TargetRegisterInfo *TRI;
|
||||
const MachineRegisterInfo *MRI;
|
||||
MachineFunction *MF;
|
||||
TargetSchedModel SchedModel;
|
||||
MachineTraceMetrics *Traces;
|
||||
MachineTraceMetrics::Ensemble *MinInstr;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
ARM64StorePairSuppress() : MachineFunctionPass(ID) {}
|
||||
|
||||
virtual const char *getPassName() const {
|
||||
return "ARM64 Store Pair Suppression";
|
||||
}
|
||||
|
||||
bool runOnMachineFunction(MachineFunction &F);
|
||||
|
||||
private:
|
||||
bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
|
||||
|
||||
bool isNarrowFPStore(const MachineInstr *MI);
|
||||
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<MachineTraceMetrics>();
|
||||
AU.addPreserved<MachineTraceMetrics>();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
char ARM64StorePairSuppress::ID = 0;
|
||||
} // anonymous
|
||||
|
||||
FunctionPass *llvm::createARM64StorePairSuppressPass() {
|
||||
return new ARM64StorePairSuppress();
|
||||
}
|
||||
|
||||
/// Return true if an STP can be added to this block without increasing the
|
||||
/// critical resource height. STP is good to form in Ld/St limited blocks and
|
||||
/// bad to form in floating-point limited blocks. This is true independent of the
|
||||
/// critical path. If the critical path is longer than the resource height, the
|
||||
/// extra vector ops can limit physreg renaming. Otherwise, it could simply
|
||||
/// oversaturate the vector units.
|
||||
bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
|
||||
MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
|
||||
unsigned ResLength = BBTrace.getResourceLength();
|
||||
|
||||
// Get the machine model's scheduling class for STPDi.
|
||||
// Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
|
||||
unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass();
|
||||
const MCSchedClassDesc *SCDesc =
|
||||
SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
|
||||
|
||||
// If a subtarget does not define resources for STPDi, bail here.
|
||||
if (SCDesc->isValid() && !SCDesc->isVariant()) {
|
||||
unsigned ResLenWithSTP = BBTrace.getResourceLength(
|
||||
ArrayRef<const MachineBasicBlock *>(), SCDesc);
|
||||
if (ResLenWithSTP > ResLength) {
|
||||
DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
|
||||
<< " resources " << ResLength << " -> " << ResLenWithSTP
|
||||
<< "\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Return true if this is a floating-point store smaller than the V reg. On
|
||||
/// Cyclone, these require a vector shuffle before storing a pair.
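/// (For instance, two adjacent STRSui stores that the load/store optimizer
/// would otherwise merge into a single STPSi.)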
|
||||
/// Ideally we would call getMatchingPairOpcode() and have the machine model
|
||||
/// tell us if it's profitable with no cpu knowledge here.
|
||||
///
|
||||
/// FIXME: We plan to develop a decent Target abstraction for simple loads and
|
||||
/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer.
|
||||
bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr *MI) {
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
return false;
|
||||
case ARM64::STRSui:
|
||||
case ARM64::STRDui:
|
||||
case ARM64::STURSi:
|
||||
case ARM64::STURDi:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
|
||||
MF = &mf;
|
||||
TII = static_cast<const ARM64InstrInfo *>(MF->getTarget().getInstrInfo());
|
||||
TRI = MF->getTarget().getRegisterInfo();
|
||||
MRI = &MF->getRegInfo();
|
||||
const TargetSubtargetInfo &ST =
|
||||
MF->getTarget().getSubtarget<TargetSubtargetInfo>();
|
||||
SchedModel.init(*ST.getSchedModel(), &ST, TII);
|
||||
|
||||
Traces = &getAnalysis<MachineTraceMetrics>();
|
||||
MinInstr = 0;
|
||||
|
||||
DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
|
||||
|
||||
if (!SchedModel.hasInstrSchedModel()) {
|
||||
DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for a sequence of stores to the same base address. We don't need to
|
||||
// precisely determine whether a store pair can be formed. But we do want to
|
||||
// filter out most situations where we can't form store pairs to avoid
|
||||
// computing trace metrics in those cases.
|
||||
for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); BI != BE;
|
||||
++BI) {
|
||||
bool SuppressSTP = false;
|
||||
unsigned PrevBaseReg = 0;
|
||||
for (MachineBasicBlock::iterator I = BI->begin(), E = BI->end(); I != E;
|
||||
++I) {
|
||||
if (!isNarrowFPStore(I))
|
||||
continue;
|
||||
unsigned BaseReg;
|
||||
unsigned Offset;
|
||||
if (TII->getLdStBaseRegImmOfs(I, BaseReg, Offset, TRI)) {
|
||||
if (PrevBaseReg == BaseReg) {
|
||||
// If this block can take STPs, skip ahead to the next block.
|
||||
if (!SuppressSTP && shouldAddSTPToBlock(I->getParent()))
|
||||
break;
|
||||
// Otherwise, continue unpairing the stores in this block.
|
||||
DEBUG(dbgs() << "Unpairing store " << *I << "\n");
|
||||
SuppressSTP = true;
|
||||
TII->suppressLdStPair(I);
|
||||
}
|
||||
PrevBaseReg = BaseReg;
|
||||
} else
|
||||
PrevBaseReg = 0;
|
||||
}
|
||||
}
|
||||
// This pass just sets some internal MachineMemOperand flags. It can't really
|
||||
// invalidate anything.
|
||||
return false;
|
||||
}
|
lib/Target/ARM64/ARM64Subtarget.cpp (new file, 83 lines)
@ -0,0 +1,83 @@
|
||||
//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64 specific subclass of TargetSubtarget.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64Subtarget.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineScheduler.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
|
||||
#define GET_SUBTARGETINFO_CTOR
|
||||
#define GET_SUBTARGETINFO_TARGET_DESC
|
||||
#include "ARM64GenSubtargetInfo.inc"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU,
|
||||
const std::string &FS)
|
||||
: ARM64GenSubtargetInfo(TT, CPU, FS), HasZeroCycleRegMove(false),
|
||||
HasZeroCycleZeroing(false), CPUString(CPU), TargetTriple(TT) {
|
||||
// Determine default and user-specified characteristics
|
||||
|
||||
if (CPUString.empty())
|
||||
// We default to Cyclone for now.
|
||||
CPUString = "cyclone";
|
||||
|
||||
ParseSubtargetFeatures(CPUString, FS);
|
||||
}
|
||||
|
||||
/// ClassifyGlobalReference - Find the target operand flags that describe
|
||||
/// how a global value should be referenced for the current subtarget.
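/// For example, with a non-static relocation model an external or weak global
/// is typically given ARM64II::MO_GOT below, so its address is materialized
/// through the GOT rather than with a direct ADRP+ADD page/offset sequence.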
|
||||
unsigned char
|
||||
ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
|
||||
const TargetMachine &TM) const {
|
||||
|
||||
// Determine whether this is a reference to a definition or a declaration.
|
||||
// Materializable GVs (in JIT lazy compilation mode) do not require an extra
|
||||
// load from stub.
|
||||
bool isDecl = GV->hasAvailableExternallyLinkage();
|
||||
if (GV->isDeclaration() && !GV->isMaterializable())
|
||||
isDecl = true;
|
||||
|
||||
// If symbol visibility is hidden, the extra load is not needed if
|
||||
// the symbol is definitely defined in the current translation unit.
|
||||
if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility() &&
|
||||
(isDecl || GV->isWeakForLinker()))
|
||||
return ARM64II::MO_GOT;
|
||||
|
||||
if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
|
||||
return ARM64II::MO_GOT;
|
||||
|
||||
// FIXME: this will fail on static ELF for weak symbols.
|
||||
return ARM64II::MO_NO_FLAG;
|
||||
}
|
||||
|
||||
/// This function returns the name of a function which has an interface
|
||||
/// like the non-standard bzero function, if such a function exists on
|
||||
/// the current subtarget and it is considered preferable over
|
||||
/// memset with zero passed as the second argument. Otherwise it
|
||||
/// returns null.
|
||||
const char *ARM64Subtarget::getBZeroEntry() const {
|
||||
// At the moment, always prefer bzero.
|
||||
return "bzero";
|
||||
}
|
||||
|
||||
void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
|
||||
MachineInstr *begin, MachineInstr *end,
|
||||
unsigned NumRegionInstrs) const {
|
||||
// LNT run (at least on Cyclone) showed reasonably significant gains for
|
||||
// bi-directional scheduling (e.g. on 253.perlbmk).
|
||||
Policy.OnlyTopDown = false;
|
||||
Policy.OnlyBottomUp = false;
|
||||
}
|
lib/Target/ARM64/ARM64Subtarget.h (new file, 87 lines)
@ -0,0 +1,87 @@
|
||||
//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the ARM64 specific subclass of TargetSubtarget.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64SUBTARGET_H
|
||||
#define ARM64SUBTARGET_H
|
||||
|
||||
#include "llvm/Target/TargetSubtargetInfo.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include <string>
|
||||
|
||||
#define GET_SUBTARGETINFO_HEADER
|
||||
#include "ARM64GenSubtargetInfo.inc"
|
||||
|
||||
namespace llvm {
|
||||
class GlobalValue;
|
||||
class StringRef;
|
||||
|
||||
class ARM64Subtarget : public ARM64GenSubtargetInfo {
|
||||
protected:
|
||||
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
|
||||
bool HasZeroCycleRegMove;
|
||||
|
||||
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
|
||||
bool HasZeroCycleZeroing;
|
||||
|
||||
/// CPUString - String name of used CPU.
|
||||
std::string CPUString;
|
||||
|
||||
/// TargetTriple - What processor and OS we're targeting.
|
||||
Triple TargetTriple;
|
||||
|
||||
public:
|
||||
/// This constructor initializes the data members to match that
|
||||
/// of the specified triple.
|
||||
ARM64Subtarget(const std::string &TT, const std::string &CPU,
|
||||
const std::string &FS);
|
||||
|
||||
virtual bool enableMachineScheduler() const { return true; }
|
||||
|
||||
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
|
||||
|
||||
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
|
||||
|
||||
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
|
||||
|
||||
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
|
||||
|
||||
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
|
||||
|
||||
bool isCyclone() const { return CPUString == "cyclone"; }
|
||||
|
||||
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
|
||||
/// that still makes it profitable to inline the call.
|
||||
unsigned getMaxInlineSizeThreshold() const { return 64; }
|
||||
|
||||
/// ParseSubtargetFeatures - Parses features string setting specified
|
||||
/// subtarget options. Definition of function is auto generated by tblgen.
|
||||
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
|
||||
|
||||
/// ClassifyGlobalReference - Find the target operand flags that describe
|
||||
/// how a global value should be referenced for the current subtarget.
|
||||
unsigned char ClassifyGlobalReference(const GlobalValue *GV,
|
||||
const TargetMachine &TM) const;
|
||||
|
||||
/// This function returns the name of a function which has an interface
|
||||
/// like the non-standard bzero function, if such a function exists on
|
||||
/// the current subtarget and it is considered preferable over
|
||||
/// memset with zero passed as the second argument. Otherwise it
|
||||
/// returns null.
|
||||
const char *getBZeroEntry() const;
|
||||
|
||||
void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
|
||||
MachineInstr *end, unsigned NumRegionInstrs) const;
|
||||
};
|
||||
} // End llvm namespace
|
||||
|
||||
#endif // ARM64SUBTARGET_H
|
lib/Target/ARM64/ARM64TargetMachine.cpp (new file, 157 lines)
@ -0,0 +1,157 @@
|
||||
//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64 specific subclass of TargetMachine.
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64.h"
|
||||
#include "ARM64TargetMachine.h"
|
||||
#include "llvm/PassManager.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool> EnableCCMP("arm64-ccmp",
|
||||
cl::desc("Enable the CCMP formation pass"),
|
||||
cl::init(true));
|
||||
|
||||
static cl::opt<bool> EnableStPairSuppress("arm64-stp-suppress", cl::Hidden,
|
||||
cl::desc("Suppress STP for ARM64"),
|
||||
cl::init(true));
|
||||
|
||||
static cl::opt<bool>
|
||||
EnablePromoteConstant("arm64-promote-const", cl::Hidden,
|
||||
cl::desc("Enable the promote constant pass"),
|
||||
cl::init(true));
|
||||
|
||||
static cl::opt<bool>
|
||||
EnableCollectLOH("arm64-collect-loh", cl::Hidden,
|
||||
cl::desc("Enable the pass that emits the linker"
|
||||
" optimization hints (LOH)"),
|
||||
cl::init(true));
|
||||
|
||||
extern "C" void LLVMInitializeARM64Target() {
|
||||
// Register the target.
|
||||
RegisterTargetMachine<ARM64TargetMachine> X(TheARM64Target);
|
||||
}
|
||||
|
||||
/// TargetMachine ctor - Create an ARM64 architecture model.
|
||||
///
|
||||
ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT,
|
||||
StringRef CPU, StringRef FS,
|
||||
const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL)
|
||||
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
|
||||
Subtarget(TT, CPU, FS),
|
||||
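// Layout string, for reference: "e" little-endian, "m:o"/"m:e" Mach-O vs. ELF
// name mangling, "i64:64"/"i128:128" 64- and 128-bit integer alignment,
// "n32:64" native integer widths, "S128" 128-bit stack alignment.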
DL(Subtarget.isTargetMachO() ? "e-m:o-i64:64-i128:128-n32:64-S128"
|
||||
: "e-m:e-i64:64-i128:128-n32:64-S128"),
|
||||
InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
|
||||
TSInfo(*this) {
|
||||
initAsmInfo();
|
||||
}
|
||||
|
||||
namespace {
|
||||
/// ARM64 Code Generator Pass Configuration Options.
|
||||
class ARM64PassConfig : public TargetPassConfig {
|
||||
public:
|
||||
ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM)
|
||||
: TargetPassConfig(TM, PM) {}
|
||||
|
||||
ARM64TargetMachine &getARM64TargetMachine() const {
|
||||
return getTM<ARM64TargetMachine>();
|
||||
}
|
||||
|
||||
virtual bool addPreISel();
|
||||
virtual bool addInstSelector();
|
||||
virtual bool addILPOpts();
|
||||
virtual bool addPreRegAlloc();
|
||||
virtual bool addPostRegAlloc();
|
||||
virtual bool addPreSched2();
|
||||
virtual bool addPreEmitPass();
|
||||
};
|
||||
} // namespace
|
||||
|
||||
void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
|
||||
// Add first the target-independent BasicTTI pass, then our ARM64 pass. This
|
||||
// allows the ARM64 pass to delegate to the target independent layer when
|
||||
// appropriate.
|
||||
PM.add(createBasicTargetTransformInfoPass(this));
|
||||
PM.add(createARM64TargetTransformInfoPass(this));
|
||||
}
|
||||
|
||||
TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) {
|
||||
return new ARM64PassConfig(this, PM);
|
||||
}
|
||||
|
||||
// Pass Pipeline Configuration
|
||||
bool ARM64PassConfig::addPreISel() {
|
||||
// Run promote constant before global merge, so that the promoted constants
|
||||
// get a chance to be merged
|
||||
if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
|
||||
addPass(createARM64PromoteConstantPass());
|
||||
if (TM->getOptLevel() != CodeGenOpt::None)
|
||||
addPass(createGlobalMergePass(TM));
|
||||
if (TM->getOptLevel() != CodeGenOpt::None)
|
||||
addPass(createARM64AddressTypePromotionPass());
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addInstSelector() {
|
||||
addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel()));
|
||||
|
||||
// For ELF, clean up any local-dynamic TLS accesses (i.e. combine as many
|
||||
// references to _TLS_MODULE_BASE_ as possible).
|
||||
if (TM->getSubtarget<ARM64Subtarget>().isTargetELF() &&
|
||||
getOptLevel() != CodeGenOpt::None)
|
||||
addPass(createARM64CleanupLocalDynamicTLSPass());
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addILPOpts() {
|
||||
if (EnableCCMP)
|
||||
addPass(createARM64ConditionalCompares());
|
||||
addPass(&EarlyIfConverterID);
|
||||
if (EnableStPairSuppress)
|
||||
addPass(createARM64StorePairSuppressPass());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addPreRegAlloc() {
|
||||
// Use AdvSIMD scalar instructions whenever profitable.
|
||||
addPass(createARM64AdvSIMDScalar());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addPostRegAlloc() {
|
||||
// Change dead register definitions to refer to the zero register.
|
||||
addPass(createARM64DeadRegisterDefinitions());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addPreSched2() {
|
||||
// Expand some pseudo instructions to allow proper scheduling.
|
||||
addPass(createARM64ExpandPseudoPass());
|
||||
// Use load/store pair instructions when possible.
|
||||
addPass(createARM64LoadStoreOptimizationPass());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ARM64PassConfig::addPreEmitPass() {
|
||||
// Relax conditional branch instructions if they're otherwise out of
|
||||
// range of their destination.
|
||||
addPass(createARM64BranchRelaxation());
|
||||
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH)
|
||||
addPass(createARM64CollectLOHPass());
|
||||
return true;
|
||||
}
|
lib/Target/ARM64/ARM64TargetMachine.h (new file, 69 lines)
@ -0,0 +1,69 @@
|
||||
//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the ARM64 specific subclass of TargetMachine.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64TARGETMACHINE_H
|
||||
#define ARM64TARGETMACHINE_H
|
||||
|
||||
#include "ARM64InstrInfo.h"
|
||||
#include "ARM64ISelLowering.h"
|
||||
#include "ARM64Subtarget.h"
|
||||
#include "ARM64FrameLowering.h"
|
||||
#include "ARM64SelectionDAGInfo.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class ARM64TargetMachine : public LLVMTargetMachine {
|
||||
protected:
|
||||
ARM64Subtarget Subtarget;
|
||||
|
||||
private:
|
||||
const DataLayout DL;
|
||||
ARM64InstrInfo InstrInfo;
|
||||
ARM64TargetLowering TLInfo;
|
||||
ARM64FrameLowering FrameLowering;
|
||||
ARM64SelectionDAGInfo TSInfo;
|
||||
|
||||
public:
|
||||
ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
|
||||
const TargetOptions &Options, Reloc::Model RM,
|
||||
CodeModel::Model CM, CodeGenOpt::Level OL);
|
||||
|
||||
virtual const ARM64Subtarget *getSubtargetImpl() const { return &Subtarget; }
|
||||
virtual const ARM64TargetLowering *getTargetLowering() const {
|
||||
return &TLInfo;
|
||||
}
|
||||
virtual const DataLayout *getDataLayout() const { return &DL; }
|
||||
virtual const ARM64FrameLowering *getFrameLowering() const {
|
||||
return &FrameLowering;
|
||||
}
|
||||
virtual const ARM64InstrInfo *getInstrInfo() const { return &InstrInfo; }
|
||||
virtual const ARM64RegisterInfo *getRegisterInfo() const {
|
||||
return &InstrInfo.getRegisterInfo();
|
||||
}
|
||||
virtual const ARM64SelectionDAGInfo *getSelectionDAGInfo() const {
|
||||
return &TSInfo;
|
||||
}
|
||||
|
||||
// Pass Pipeline Configuration
|
||||
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
|
||||
|
||||
/// \brief Register ARM64 analysis passes with a pass manager.
|
||||
virtual void addAnalysisPasses(PassManagerBase &PM);
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
lib/Target/ARM64/ARM64TargetObjectFile.cpp (new file, 52 lines)
@ -0,0 +1,52 @@
|
||||
//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64TargetObjectFile.h"
|
||||
#include "ARM64TargetMachine.h"
|
||||
#include "llvm/IR/Mangler.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/Support/Dwarf.h"
|
||||
using namespace llvm;
|
||||
using namespace dwarf;
|
||||
|
||||
void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
|
||||
const TargetMachine &TM) {
|
||||
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
|
||||
InitializeELF(TM.Options.UseInitArray);
|
||||
}
|
||||
|
||||
const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference(
|
||||
const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
|
||||
const TargetMachine &TM, MachineModuleInfo *MMI,
|
||||
MCStreamer &Streamer) const {
|
||||
// On Darwin, we can reference dwarf symbols with foo@GOT-., which
|
||||
// is an indirect pc-relative reference. The default implementation
|
||||
// won't reference using the GOT, so we need this target-specific
|
||||
// version.
|
||||
if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
|
||||
const MCSymbol *Sym = TM.getSymbol(GV, Mang);
|
||||
const MCExpr *Res =
|
||||
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
|
||||
MCSymbol *PCSym = getContext().CreateTempSymbol();
|
||||
Streamer.EmitLabel(PCSym);
|
||||
const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
|
||||
return MCBinaryExpr::CreateSub(Res, PC, getContext());
|
||||
}
|
||||
|
||||
return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
|
||||
GV, Encoding, Mang, TM, MMI, Streamer);
|
||||
}
|
||||
|
||||
MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol(
|
||||
const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
|
||||
MachineModuleInfo *MMI) const {
|
||||
return TM.getSymbol(GV, Mang);
|
||||
}
|
lib/Target/ARM64/ARM64TargetObjectFile.h (new file, 40 lines)
@ -0,0 +1,40 @@
|
||||
//===-- ARM64TargetObjectFile.h - ARM64 Object Info ------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
|
||||
#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
|
||||
|
||||
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
||||
#include "llvm/Target/TargetLoweringObjectFile.h"
|
||||
|
||||
namespace llvm {
|
||||
class ARM64TargetMachine;
|
||||
|
||||
/// This implementation is used for ARM64 ELF targets (Linux in particular).
|
||||
class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
|
||||
virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
|
||||
};
|
||||
|
||||
/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
|
||||
class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
|
||||
public:
|
||||
const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
|
||||
unsigned Encoding, Mangler &Mang,
|
||||
const TargetMachine &TM,
|
||||
MachineModuleInfo *MMI,
|
||||
MCStreamer &Streamer) const override;
|
||||
|
||||
MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
|
||||
const TargetMachine &TM,
|
||||
MachineModuleInfo *MMI) const override;
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
lib/Target/ARM64/ARM64TargetTransformInfo.cpp (new file, 326 lines)
@ -0,0 +1,326 @@
|
||||
//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// \file
|
||||
/// This file implements a TargetTransformInfo analysis pass specific to the
|
||||
/// ARM64 target machine. It uses the target's detailed information to provide
|
||||
/// more precise answers to certain TTI queries, while letting the target
|
||||
/// independent and default TTI implementations handle the rest.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "arm64tti"
|
||||
#include "ARM64.h"
|
||||
#include "ARM64TargetMachine.h"
|
||||
#include "MCTargetDesc/ARM64AddressingModes.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Target/CostTable.h"
|
||||
#include "llvm/Target/TargetLowering.h"
|
||||
using namespace llvm;
|
||||
|
||||
// Declare the pass initialization routine locally as target-specific passes
|
||||
// don't have a target-wide initialization entry point, and so we rely on the
|
||||
// pass constructor initialization.
|
||||
namespace llvm {
|
||||
void initializeARM64TTIPass(PassRegistry &);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
|
||||
const ARM64TargetMachine *TM;
|
||||
const ARM64Subtarget *ST;
|
||||
const ARM64TargetLowering *TLI;
|
||||
|
||||
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
|
||||
/// are set if the result needs to be inserted and/or extracted from vectors.
|
||||
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
|
||||
|
||||
public:
|
||||
ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
|
||||
llvm_unreachable("This pass cannot be directly constructed");
|
||||
}
|
||||
|
||||
ARM64TTI(const ARM64TargetMachine *TM)
|
||||
: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
|
||||
TLI(TM->getTargetLowering()) {
|
||||
initializeARM64TTIPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
void initializePass() override { pushTTIStack(this); }
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
TargetTransformInfo::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
/// Pass identification.
|
||||
static char ID;
|
||||
|
||||
/// Provide necessary pointer adjustments for the two base classes.
|
||||
void *getAdjustedAnalysisPointer(const void *ID) override {
|
||||
if (ID == &TargetTransformInfo::ID)
|
||||
return (TargetTransformInfo *)this;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// \name Scalar TTI Implementations
|
||||
/// @{
|
||||
|
||||
unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
|
||||
PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \name Vector TTI Implementations
|
||||
/// @{
|
||||
|
||||
unsigned getNumberOfRegisters(bool Vector) const override {
|
||||
if (Vector)
|
||||
return 32;
|
||||
|
||||
return 31;
|
||||
}
|
||||
|
||||
unsigned getRegisterBitWidth(bool Vector) const override {
|
||||
if (Vector)
|
||||
return 128;
|
||||
|
||||
return 64;
|
||||
}
|
||||
|
||||
unsigned getMaximumUnrollFactor() const override { return 2; }
|
||||
|
||||
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
|
||||
override;
|
||||
|
||||
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
|
||||
override;
|
||||
|
||||
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
OperandValueKind Opd1Info = OK_AnyValue,
|
||||
OperandValueKind Opd2Info = OK_AnyValue) const
|
||||
override;
|
||||
|
||||
unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
|
||||
|
||||
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
|
||||
override;
|
||||
|
||||
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
unsigned AddressSpace) const override;
|
||||
/// @}
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
|
||||
"ARM64 Target Transform Info", true, true, false)
|
||||
char ARM64TTI::ID = 0;
|
||||
|
||||
ImmutablePass *
|
||||
llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
|
||||
return new ARM64TTI(TM);
|
||||
}
|
||||
|
||||
unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
|
||||
assert(Ty->isIntegerTy());
|
||||
|
||||
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
||||
if (BitSize == 0)
|
||||
return ~0U;
|
||||
|
||||
int64_t Val = Imm.getSExtValue();
|
||||
if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
|
||||
return 1;
|
||||
|
||||
if ((int64_t)Val < 0)
|
||||
Val = ~Val;
|
||||
if (BitSize == 32)
|
||||
Val &= (1LL << 32) - 1;
|
||||
|
||||
unsigned LZ = countLeadingZeros((uint64_t)Val);
|
||||
unsigned Shift = (63 - LZ) / 16;
|
||||
// MOVZ is free, so the cost is the number of MOVKs required (minimum of 1).
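// For example, Imm = 0x00123456789ABCDE has 11 leading zero bits, so
// Shift = (63 - 11) / 16 = 3 and the returned cost is 3 (three MOVKs on top
// of the free MOVZ).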
|
||||
return (Shift == 0) ? 1 : Shift;
|
||||
}
|
||||
|
||||
ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
|
||||
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
|
||||
if (TyWidth == 32 || TyWidth == 64)
|
||||
return PSK_FastHardware;
|
||||
// TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
|
||||
return PSK_Software;
|
||||
}
|
||||
|
||||
unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
|
||||
Type *Src) const {
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
assert(ISD && "Invalid opcode");
|
||||
|
||||
EVT SrcTy = TLI->getValueType(Src);
|
||||
EVT DstTy = TLI->getValueType(Dst);
|
||||
|
||||
if (!SrcTy.isSimple() || !DstTy.isSimple())
|
||||
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
|
||||
|
||||
static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
|
||||
// LowerVectorINT_TO_FP:
|
||||
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
||||
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
|
||||
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
|
||||
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
|
||||
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
||||
// LowerVectorFP_TO_INT
|
||||
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
|
||||
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
|
||||
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
|
||||
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
|
||||
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
|
||||
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
|
||||
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
|
||||
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
|
||||
};
|
||||
|
||||
int Idx = ConvertCostTableLookup<MVT>(
|
||||
ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
|
||||
SrcTy.getSimpleVT());
|
||||
if (Idx != -1)
|
||||
return ConversionTbl[Idx].Cost;
|
||||
|
||||
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
|
||||
}
|
||||
|
||||
unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) const {
|
||||
assert(Val->isVectorTy() && "This must be a vector type");
|
||||
|
||||
if (Index != -1U) {
|
||||
// Legalize the type.
|
||||
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
|
||||
|
||||
// This type is legalized to a scalar type.
|
||||
if (!LT.second.isVector())
|
||||
return 0;
|
||||
|
||||
// The type may be split. Normalize the index to the new type.
|
||||
unsigned Width = LT.second.getVectorNumElements();
|
||||
Index = Index % Width;
|
||||
|
||||
// The element at index zero is already inside the vector.
|
||||
if (Index == 0)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// All other insert/extracts cost this much.
|
||||
return 2;
|
||||
}
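// Worked example (illustrative, not part of the original commit): extracting
// element 2 of a <4 x i64> value legalizes to two v2i64 halves, so the index
// normalizes to 2 % 2 == 0 and the extract is treated as free; extracting
// element 1 of a <2 x i64> keeps Index == 1 and costs 2 like any other lane.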
|
||||
|
||||
unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
OperandValueKind Opd1Info,
|
||||
OperandValueKind Opd2Info) const {
|
||||
// Legalize the type.
|
||||
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
|
||||
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
|
||||
switch (ISD) {
|
||||
default:
|
||||
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
|
||||
Opd2Info);
|
||||
case ISD::ADD:
|
||||
case ISD::MUL:
|
||||
case ISD::XOR:
|
||||
case ISD::OR:
|
||||
case ISD::AND:
|
||||
// These nodes are marked as 'custom' for combining purposes only.
|
||||
// We know that they are legal. See LowerAdd in ISelLowering.
|
||||
return 1 * LT.first;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
|
||||
// Address computations in vectorized code with non-consecutive addresses will
|
||||
// likely result in more instructions compared to scalar code where the
|
||||
// computation can more often be merged into the index mode. The resulting
|
||||
// extra micro-ops can significantly decrease throughput.
|
||||
unsigned NumVectorInstToHideOverhead = 10;
|
||||
|
||||
if (Ty->isVectorTy() && IsComplex)
|
||||
return NumVectorInstToHideOverhead;
|
||||
|
||||
// In many cases the address computation is not merged into the instruction
|
||||
// addressing mode.
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
||||
Type *CondTy) const {
|
||||
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
// We don't lower vector selects well that are wider than the register width.
|
||||
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
|
||||
// We would need this many instructions to hide the scalarization happening.
|
||||
unsigned AmortizationCost = 20;
|
||||
static const TypeConversionCostTblEntry<MVT::SimpleValueType>
|
||||
VectorSelectTbl[] = {
|
||||
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
|
||||
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
|
||||
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
|
||||
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
|
||||
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
|
||||
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
|
||||
};
|
||||
|
||||
EVT SelCondTy = TLI->getValueType(CondTy);
|
||||
EVT SelValTy = TLI->getValueType(ValTy);
|
||||
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
|
||||
int Idx =
|
||||
ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
|
||||
SelValTy.getSimpleVT());
|
||||
if (Idx != -1)
|
||||
return VectorSelectTbl[Idx].Cost;
|
||||
}
|
||||
}
|
||||
return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
|
||||
}
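// Worked example (illustrative, not part of the original commit): a select
// whose condition is <16 x i1> and whose value type is <16 x i32> hits the
// table above and is priced at 16 * 20 == 320, steering the vectorizers away
// from wide selects that would have to be scalarized.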
|
||||
|
||||
unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
unsigned Alignment,
|
||||
unsigned AddressSpace) const {
|
||||
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
|
||||
|
||||
if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
|
||||
Src->getVectorElementType()->isIntegerTy(64)) {
|
||||
// Unaligned stores are extremely inefficient. We don't split
|
||||
// unaligned v2i64 stores because the negative impact that has shown in
|
||||
// practice on inlined memcpy code.
|
||||
// We make v2i64 stores expensive so that we will only vectorize if there
|
||||
// are 6 other instructions getting vectorized.
|
||||
unsigned AmortizationCost = 6;
|
||||
|
||||
return LT.first * 2 * AmortizationCost;
|
||||
}
|
||||
|
||||
if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
|
||||
Src->getVectorNumElements() < 8) {
|
||||
// We scalarize the loads/stores because there is not v.4b register and we
|
||||
// have to promote the elements to v.4h.
|
||||
unsigned NumVecElts = Src->getVectorNumElements();
|
||||
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
|
||||
// We generate 2 instructions per vector element.
|
||||
return NumVectorizableInstsToAmortize * NumVecElts * 2;
|
||||
}
|
||||
|
||||
return LT.first;
|
||||
}
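// Worked example (illustrative, not part of the original commit): an unaligned
// store of <2 x i64> legalizes to a single v2i64 (LT.first == 1), so it is
// priced at 1 * 2 * 6 == 12; a load or store of <4 x i8> is priced at
// (4 * 2) * 4 * 2 == 64, reflecting the per-element scalarization through
// v.4h described in the comment above.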
|
4832  lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp (new file; diff too large, not shown)
6  lib/Target/ARM64/AsmParser/CMakeLists.txt (new file)
@ -0,0 +1,6 @@
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )

add_llvm_library(LLVMARM64AsmParser
  ARM64AsmParser.cpp
  )
24  lib/Target/ARM64/AsmParser/LLVMBuild.txt (new file)
@ -0,0 +1,24 @@
;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
;   http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Library
name = ARM64AsmParser
parent = ARM64
required_libraries = ARM64Desc ARM64Info MC MCParser Support
add_to_library_groups = ARM64
15  lib/Target/ARM64/AsmParser/Makefile (new file)
@ -0,0 +1,15 @@
##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##
LEVEL = ../../../..
LIBRARYNAME = LLVMARM64AsmParser

# Hack: we need to include 'main' ARM target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..

include $(LEVEL)/Makefile.common
50  lib/Target/ARM64/CMakeLists.txt (new file)
@ -0,0 +1,50 @@
set(LLVM_TARGET_DEFINITIONS ARM64.td)

tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info)
tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(ARM64CommonTableGen)

add_llvm_target(ARM64CodeGen
  ARM64AddressTypePromotion.cpp
  ARM64AdvSIMDScalarPass.cpp
  ARM64AsmPrinter.cpp
  ARM64BranchRelaxation.cpp
  ARM64CleanupLocalDynamicTLSPass.cpp
  ARM64CollectLOH.cpp
  ARM64ConditionalCompares.cpp
  ARM64DeadRegisterDefinitionsPass.cpp
  ARM64ExpandPseudoInsts.cpp
  ARM64FastISel.cpp
  ARM64FrameLowering.cpp
  ARM64ISelDAGToDAG.cpp
  ARM64ISelLowering.cpp
  ARM64InstrInfo.cpp
  ARM64LoadStoreOptimizer.cpp
  ARM64MCInstLower.cpp
  ARM64PromoteConstant.cpp
  ARM64RegisterInfo.cpp
  ARM64SelectionDAGInfo.cpp
  ARM64StorePairSuppress.cpp
  ARM64Subtarget.cpp
  ARM64TargetMachine.cpp
  ARM64TargetObjectFile.cpp
  ARM64TargetTransformInfo.cpp
  )

add_dependencies(LLVMARM64CodeGen intrinsics_gen)

add_subdirectory(TargetInfo)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
2142  lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp (new file; diff too large, not shown)
54  lib/Target/ARM64/Disassembler/ARM64Disassembler.h (new file)
@ -0,0 +1,54 @@
//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//

#ifndef ARM64DISASSEMBLER_H
#define ARM64DISASSEMBLER_H

#include "llvm/MC/MCDisassembler.h"

namespace llvm {

class MCInst;
class MemoryObject;
class raw_ostream;

class ARM64Disassembler : public MCDisassembler {
public:
  ARM64Disassembler(const MCSubtargetInfo &STI) : MCDisassembler(STI) {}

  ~ARM64Disassembler() {}

  /// getInstruction - See MCDisassembler.
  MCDisassembler::DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
                                              const MemoryObject &region,
                                              uint64_t address,
                                              raw_ostream &vStream,
                                              raw_ostream &cStream) const;

  /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
  /// operand in place of the immediate Value in the MCInst. The immediate
  /// Value has not had any PC adjustment made by the caller. If the instruction
  /// adds the PC to the immediate Value then InstsAddsAddressToValue is true,
  /// else false. If the getOpInfo() function was set as part of the
  /// setupForSymbolicDisassembly() call then that function is called to get any
  /// symbolic information at the Address for this instruction. If that returns
  /// non-zero then the symbolic information it returns is used to create an
  /// MCExpr and that is added as an operand to the MCInst. This function
  /// returns true if it adds an operand to the MCInst and false otherwise.
  bool tryAddingSymbolicOperand(uint64_t Address, int Value,
                                bool InstsAddsAddressToValue, uint64_t InstSize,
                                MCInst &MI, uint32_t insn = 0) const;
};

} // namespace llvm

#endif
13  lib/Target/ARM64/Disassembler/CMakeLists.txt (new file)
@ -0,0 +1,13 @@
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )

add_llvm_library(LLVMARM64Disassembler
  ARM64Disassembler.cpp
  )
# workaround for hanging compilation on MSVC8, 9 and 10
#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
#set_property(
# SOURCE ARMDisassembler.cpp
# PROPERTY COMPILE_FLAGS "/Od"
# )
#endif()
add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen)
24  lib/Target/ARM64/Disassembler/LLVMBuild.txt (new file)
@ -0,0 +1,24 @@
;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
;   http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Library
name = ARM64Disassembler
parent = ARM64
required_libraries = ARM64Desc ARM64Info MC Support
add_to_library_groups = ARM64
16  lib/Target/ARM64/Disassembler/Makefile (new file)
@ -0,0 +1,16 @@
##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##

LEVEL = ../../../..
LIBRARYNAME = LLVMARM64Disassembler

# Hack: we need to include 'main' arm target directory to grab private headers
CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..

include $(LEVEL)/Makefile.common
1428  lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp (new file; diff too large, not shown)
157  lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h (new file)
@ -0,0 +1,157 @@
|
||||
//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This class prints an ARM64 MCInst to a .s file.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64INSTPRINTER_H
|
||||
#define ARM64INSTPRINTER_H
|
||||
|
||||
#include "MCTargetDesc/ARM64MCTargetDesc.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCInstPrinter.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class MCOperand;
|
||||
|
||||
class ARM64InstPrinter : public MCInstPrinter {
|
||||
public:
|
||||
ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
|
||||
const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
|
||||
|
||||
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
|
||||
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
|
||||
|
||||
// Autogenerated by tblgen.
|
||||
virtual void printInstruction(const MCInst *MI, raw_ostream &O);
|
||||
virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
|
||||
virtual StringRef getRegName(unsigned RegNo) const {
|
||||
return getRegisterName(RegNo);
|
||||
}
|
||||
static const char *getRegisterName(unsigned RegNo,
|
||||
unsigned AltIdx = ARM64::NoRegAltName);
|
||||
|
||||
protected:
|
||||
bool printSysAlias(const MCInst *MI, raw_ostream &O);
|
||||
// Operand printers
|
||||
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
|
||||
raw_ostream &O);
|
||||
void printPostIncOperand1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand2(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand3(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand4(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand6(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand8(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand12(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand16(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand24(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand32(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand48(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printPostIncOperand64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
||||
void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printDotCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printAlignedBranchTarget(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
void printAMIndexed(const MCInst *MI, unsigned OpNum, unsigned Scale,
|
||||
raw_ostream &O);
|
||||
void printAMIndexed128(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 16, O);
|
||||
}
|
||||
|
||||
void printAMIndexed64(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 8, O);
|
||||
}
|
||||
|
||||
void printAMIndexed32(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 4, O);
|
||||
}
|
||||
|
||||
void printAMIndexed16(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 2, O);
|
||||
}
|
||||
|
||||
void printAMIndexed8(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 1, O);
|
||||
}
|
||||
void printAMUnscaled(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
|
||||
printAMIndexed(MI, OpNum, 1, O);
|
||||
}
|
||||
void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printImmScale4(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printImmScale8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printImmScale16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryPostIndexed32(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
void printMemoryPostIndexed64(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
void printMemoryPostIndexed128(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O,
|
||||
int LegalShiftAmt);
|
||||
void printMemoryRegOffset8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryRegOffset16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryRegOffset32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryRegOffset64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printMemoryRegOffset128(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
|
||||
void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
|
||||
void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
|
||||
StringRef LayoutSuffix);
|
||||
|
||||
/// Print a list of vector registers where the type suffix is implicit
|
||||
/// (i.e. attached to the instruction rather than the registers).
|
||||
void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
|
||||
raw_ostream &O);
|
||||
|
||||
template <unsigned NumLanes, char LaneKind>
|
||||
void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
|
||||
void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printSystemCPSRField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
||||
};
|
||||
|
||||
class ARM64AppleInstPrinter : public ARM64InstPrinter {
|
||||
public:
|
||||
ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
|
||||
const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
|
||||
|
||||
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
|
||||
|
||||
virtual void printInstruction(const MCInst *MI, raw_ostream &O);
|
||||
virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
|
||||
virtual StringRef getRegName(unsigned RegNo) const {
|
||||
return getRegisterName(RegNo);
|
||||
}
|
||||
static const char *getRegisterName(unsigned RegNo,
|
||||
unsigned AltIdx = ARM64::NoRegAltName);
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
7  lib/Target/ARM64/InstPrinter/CMakeLists.txt (new file)
@ -0,0 +1,7 @@
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )

add_llvm_library(LLVMARM64AsmPrinter
  ARM64InstPrinter.cpp
  )

add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen)
24  lib/Target/ARM64/InstPrinter/LLVMBuild.txt (new file)
@ -0,0 +1,24 @@
;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
;   http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Library
name = ARM64AsmPrinter
parent = ARM64
required_libraries = MC Support
add_to_library_groups = ARM64
15  lib/Target/ARM64/InstPrinter/Makefile (new file)
@ -0,0 +1,15 @@
##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##
LEVEL = ../../../..
LIBRARYNAME = LLVMARM64AsmPrinter

# Hack: we need to include 'main' arm target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..

include $(LEVEL)/Makefile.common
36  lib/Target/ARM64/LLVMBuild.txt (new file)
@ -0,0 +1,36 @@
;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
;   http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[common]
subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo

[component_0]
type = TargetGroup
name = ARM64
parent = Target
has_asmparser = 1
has_asmprinter = 1
has_disassembler = 1
has_jit = 1

[component_1]
type = Library
name = ARM64CodeGen
parent = ARM64
required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target
add_to_library_groups = ARM64
759  lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h (new file)
@ -0,0 +1,759 @@
|
||||
//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the ARM64 addressing mode implementation stuff.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
|
||||
#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
|
||||
|
||||
#include "llvm/ADT/APFloat.h"
|
||||
#include "llvm/ADT/APInt.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include <cassert>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// ARM64_AM - ARM64 Addressing Mode Stuff
|
||||
namespace ARM64_AM {
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Shifts
|
||||
//
|
||||
|
||||
enum ShiftType {
|
||||
InvalidShift = -1,
|
||||
LSL = 0,
|
||||
LSR = 1,
|
||||
ASR = 2,
|
||||
ROR = 3,
|
||||
MSL = 4
|
||||
};
|
||||
|
||||
/// getShiftName - Get the string encoding for the shift type.
|
||||
static inline const char *getShiftName(ARM64_AM::ShiftType ST) {
|
||||
switch (ST) {
|
||||
default: assert(false && "unhandled shift type!");
|
||||
case ARM64_AM::LSL: return "lsl";
|
||||
case ARM64_AM::LSR: return "lsr";
|
||||
case ARM64_AM::ASR: return "asr";
|
||||
case ARM64_AM::ROR: return "ror";
|
||||
case ARM64_AM::MSL: return "msl";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getShiftType - Extract the shift type.
|
||||
static inline ARM64_AM::ShiftType getShiftType(unsigned Imm) {
|
||||
return ARM64_AM::ShiftType((Imm >> 6) & 0x7);
|
||||
}
|
||||
|
||||
/// getShiftValue - Extract the shift value.
|
||||
static inline unsigned getShiftValue(unsigned Imm) {
|
||||
return Imm & 0x3f;
|
||||
}
|
||||
|
||||
/// getShifterImm - Encode the shift type and amount:
|
||||
/// imm: 6-bit shift amount
|
||||
/// shifter: 000 ==> lsl
|
||||
/// 001 ==> lsr
|
||||
/// 010 ==> asr
|
||||
/// 011 ==> ror
|
||||
/// 100 ==> msl
|
||||
/// {8-6} = shifter
|
||||
/// {5-0} = imm
|
||||
static inline unsigned getShifterImm(ARM64_AM::ShiftType ST, unsigned Imm) {
|
||||
assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
|
||||
return (unsigned(ST) << 6) | (Imm & 0x3f);
|
||||
}
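// Round-trip sketch (illustrative, not part of the imported header): the
// helpers above pack a shift kind and amount into one operand value and
// unpack it again, e.g.:
//   unsigned Packed = ARM64_AM::getShifterImm(ARM64_AM::LSL, 12);
//   assert(Packed == ((0u << 6) | 12));
//   assert(ARM64_AM::getShiftType(Packed) == ARM64_AM::LSL);
//   assert(ARM64_AM::getShiftValue(Packed) == 12);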
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Extends
|
||||
//
|
||||
|
||||
enum ExtendType {
|
||||
InvalidExtend = -1,
|
||||
UXTB = 0,
|
||||
UXTH = 1,
|
||||
UXTW = 2,
|
||||
UXTX = 3,
|
||||
SXTB = 4,
|
||||
SXTH = 5,
|
||||
SXTW = 6,
|
||||
SXTX = 7
|
||||
};
|
||||
|
||||
/// getExtendName - Get the string encoding for the extend type.
|
||||
static inline const char *getExtendName(ARM64_AM::ExtendType ET) {
|
||||
switch (ET) {
|
||||
default: assert(false && "unhandled extend type!");
|
||||
case ARM64_AM::UXTB: return "uxtb";
|
||||
case ARM64_AM::UXTH: return "uxth";
|
||||
case ARM64_AM::UXTW: return "uxtw";
|
||||
case ARM64_AM::UXTX: return "uxtx";
|
||||
case ARM64_AM::SXTB: return "sxtb";
|
||||
case ARM64_AM::SXTH: return "sxth";
|
||||
case ARM64_AM::SXTW: return "sxtw";
|
||||
case ARM64_AM::SXTX: return "sxtx";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getArithShiftValue - get the arithmetic shift value.
|
||||
static inline unsigned getArithShiftValue(unsigned Imm) {
|
||||
return Imm & 0x7;
|
||||
}
|
||||
|
||||
/// getExtendType - Extract the extend type for operands of arithmetic ops.
|
||||
static inline ARM64_AM::ExtendType getArithExtendType(unsigned Imm) {
|
||||
return ARM64_AM::ExtendType((Imm >> 3) & 0x7);
|
||||
}
|
||||
|
||||
/// getArithExtendImm - Encode the extend type and shift amount for an
|
||||
/// arithmetic instruction:
|
||||
/// imm: 3-bit extend amount
|
||||
/// shifter: 000 ==> uxtb
|
||||
/// 001 ==> uxth
|
||||
/// 010 ==> uxtw
|
||||
/// 011 ==> uxtx
|
||||
/// 100 ==> sxtb
|
||||
/// 101 ==> sxth
|
||||
/// 110 ==> sxtw
|
||||
/// 111 ==> sxtx
|
||||
/// {5-3} = shifter
|
||||
/// {2-0} = imm3
|
||||
static inline unsigned getArithExtendImm(ARM64_AM::ExtendType ET,
|
||||
unsigned Imm) {
|
||||
assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
|
||||
return (unsigned(ET) << 3) | (Imm & 0x7);
|
||||
}
|
||||
|
||||
/// getMemDoShift - Extract the "do shift" flag value for load/store
|
||||
/// instructions.
|
||||
static inline bool getMemDoShift(unsigned Imm) {
|
||||
return (Imm & 0x1) != 0;
|
||||
}
|
||||
|
||||
/// getExtendType - Extract the extend type for the offset operand of
|
||||
/// loads/stores.
|
||||
static inline ARM64_AM::ExtendType getMemExtendType(unsigned Imm) {
|
||||
return ARM64_AM::ExtendType((Imm >> 1) & 0x7);
|
||||
}
|
||||
|
||||
/// getExtendImm - Encode the extend type and amount for a load/store inst:
|
||||
/// imm: 3-bit extend amount
|
||||
/// shifter: 000 ==> uxtb
|
||||
/// 001 ==> uxth
|
||||
/// 010 ==> uxtw
|
||||
/// 011 ==> uxtx
|
||||
/// 100 ==> sxtb
|
||||
/// 101 ==> sxth
|
||||
/// 110 ==> sxtw
|
||||
/// 111 ==> sxtx
|
||||
/// {3-1} = shifter
|
||||
/// {0} = imm3
|
||||
static inline unsigned getMemExtendImm(ARM64_AM::ExtendType ET, bool Imm) {
|
||||
assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
|
||||
return (unsigned(ET) << 1) | (Imm & 0x7);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Prefetch
|
||||
//
|
||||
|
||||
/// Pre-fetch operator names.
|
||||
/// The enum values match the encoding values:
|
||||
/// prfop<4:3> 00=preload data, 10=prepare for store
|
||||
/// prfop<2:1> 00=target L1 cache, 01=target L2 cache, 10=target L3 cache,
|
||||
/// prfop<0> 0=non-streaming (temporal), 1=streaming (non-temporal)
|
||||
enum PrefetchOp {
|
||||
InvalidPrefetchOp = -1,
|
||||
PLDL1KEEP = 0x00,
|
||||
PLDL1STRM = 0x01,
|
||||
PLDL2KEEP = 0x02,
|
||||
PLDL2STRM = 0x03,
|
||||
PLDL3KEEP = 0x04,
|
||||
PLDL3STRM = 0x05,
|
||||
PSTL1KEEP = 0x10,
|
||||
PSTL1STRM = 0x11,
|
||||
PSTL2KEEP = 0x12,
|
||||
PSTL2STRM = 0x13,
|
||||
PSTL3KEEP = 0x14,
|
||||
PSTL3STRM = 0x15
|
||||
};
|
||||
|
||||
/// isNamedPrefetchOp - Check if the prefetch-op 5-bit value has a name.
|
||||
static inline bool isNamedPrefetchOp(unsigned prfop) {
|
||||
switch (prfop) {
|
||||
default: return false;
|
||||
case ARM64_AM::PLDL1KEEP: case ARM64_AM::PLDL1STRM: case ARM64_AM::PLDL2KEEP:
|
||||
case ARM64_AM::PLDL2STRM: case ARM64_AM::PLDL3KEEP: case ARM64_AM::PLDL3STRM:
|
||||
case ARM64_AM::PSTL1KEEP: case ARM64_AM::PSTL1STRM: case ARM64_AM::PSTL2KEEP:
|
||||
case ARM64_AM::PSTL2STRM: case ARM64_AM::PSTL3KEEP: case ARM64_AM::PSTL3STRM:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// getPrefetchOpName - Get the string encoding for the prefetch operator.
|
||||
static inline const char *getPrefetchOpName(ARM64_AM::PrefetchOp prfop) {
|
||||
switch (prfop) {
|
||||
default: assert(false && "unhandled prefetch-op type!");
|
||||
case ARM64_AM::PLDL1KEEP: return "pldl1keep";
|
||||
case ARM64_AM::PLDL1STRM: return "pldl1strm";
|
||||
case ARM64_AM::PLDL2KEEP: return "pldl2keep";
|
||||
case ARM64_AM::PLDL2STRM: return "pldl2strm";
|
||||
case ARM64_AM::PLDL3KEEP: return "pldl3keep";
|
||||
case ARM64_AM::PLDL3STRM: return "pldl3strm";
|
||||
case ARM64_AM::PSTL1KEEP: return "pstl1keep";
|
||||
case ARM64_AM::PSTL1STRM: return "pstl1strm";
|
||||
case ARM64_AM::PSTL2KEEP: return "pstl2keep";
|
||||
case ARM64_AM::PSTL2STRM: return "pstl2strm";
|
||||
case ARM64_AM::PSTL3KEEP: return "pstl3keep";
|
||||
case ARM64_AM::PSTL3STRM: return "pstl3strm";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline uint64_t ror(uint64_t elt, unsigned size) {
|
||||
return ((elt & 1) << (size-1)) | (elt >> 1);
|
||||
}
|
||||
|
||||
/// processLogicalImmediate - Determine if an immediate value can be encoded
|
||||
/// as the immediate operand of a logical instruction for the given register
|
||||
/// size. If so, return true with "encoding" set to the encoded value in
|
||||
/// the form N:immr:imms.
|
||||
static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
|
||||
uint64_t &encoding) {
|
||||
if (imm == 0ULL || imm == ~0ULL ||
|
||||
(regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
|
||||
return false;
|
||||
|
||||
unsigned size = 2;
|
||||
uint64_t eltVal = imm;
|
||||
|
||||
// First, determine the element size.
|
||||
while (size < regSize) {
|
||||
unsigned numElts = regSize / size;
|
||||
unsigned mask = (1ULL << size) - 1;
|
||||
uint64_t lowestEltVal = imm & mask;
|
||||
|
||||
bool allMatched = true;
|
||||
for (unsigned i = 1; i < numElts; ++i) {
|
||||
uint64_t currEltVal = (imm >> (i*size)) & mask;
|
||||
if (currEltVal != lowestEltVal) {
|
||||
allMatched = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (allMatched) {
|
||||
eltVal = lowestEltVal;
|
||||
break;
|
||||
}
|
||||
|
||||
size *= 2;
|
||||
}
|
||||
|
||||
// Second, determine the rotation to make the element be: 0^m 1^n.
|
||||
for (unsigned i = 0; i < size; ++i) {
|
||||
eltVal = ror(eltVal, size);
|
||||
uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
|
||||
uint32_t cto = CountTrailingOnes_64(eltVal);
|
||||
|
||||
if (clz + cto == size) {
|
||||
// Encode in immr the number of RORs it would take to get *from* this
|
||||
// element value to our target value, where i+1 is the number of RORs
|
||||
// to go the opposite direction.
|
||||
unsigned immr = size - (i + 1);
|
||||
|
||||
// If size has a 1 in the n'th bit, create a value that has zeroes in
|
||||
// bits [0, n] and ones above that.
|
||||
uint64_t nimms = ~(size-1) << 1;
|
||||
|
||||
// Or the CTO value into the low bits, which must be below the Nth bit
|
||||
// bit mentioned above.
|
||||
nimms |= (cto-1);
|
||||
|
||||
// Extract the seventh bit and toggle it to create the N field.
|
||||
unsigned N = ((nimms >> 6) & 1) ^ 1;
|
||||
|
||||
encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// isLogicalImmediate - Return true if the immediate is valid for a logical
|
||||
/// immediate instruction of the given register size. Return false otherwise.
|
||||
static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
|
||||
uint64_t encoding;
|
||||
return processLogicalImmediate(imm, regSize, encoding);
|
||||
}
|
||||
|
||||
/// encodeLogicalImmediate - Return the encoded immediate value for a logical
|
||||
/// immediate instruction of the given register size.
|
||||
static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
|
||||
uint64_t encoding = 0;
|
||||
bool res = processLogicalImmediate(imm, regSize, encoding);
|
||||
assert(res && "invalid logical immediate");
|
||||
(void)res;
|
||||
return encoding;
|
||||
}
|
||||
|
||||
/// decodeLogicalImmediate - Decode a logical immediate value in the form
|
||||
/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
|
||||
/// integer value it represents with regSize bits.
|
||||
static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
|
||||
// Extract the N, imms, and immr fields.
|
||||
unsigned N = (val >> 12) & 1;
|
||||
unsigned immr = (val >> 6) & 0x3f;
|
||||
unsigned imms = val & 0x3f;
|
||||
|
||||
assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
|
||||
int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
|
||||
assert(len >= 0 && "undefined logical immediate encoding");
|
||||
unsigned size = (1 << len);
|
||||
unsigned R = immr & (size - 1);
|
||||
unsigned S = imms & (size - 1);
|
||||
assert(S != size - 1 && "undefined logical immediate encoding");
|
||||
uint64_t pattern = (1ULL << (S + 1)) - 1;
|
||||
for (unsigned i = 0; i < R; ++i)
|
||||
pattern = ror(pattern, size);
|
||||
|
||||
// Replicate the pattern to fill the regSize.
|
||||
while (size != regSize) {
|
||||
pattern |= (pattern << size);
|
||||
size *= 2;
|
||||
}
|
||||
return pattern;
|
||||
}
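// Round-trip sketch (illustrative, not part of the imported header): a value
// made of a repeating run of ones is a valid logical immediate, and the
// encode/decode helpers above are inverses on it, e.g.:
//   uint64_t Imm = 0x00ff00ff00ff00ffULL;   // 16-bit element 0x00ff, repeated
//   assert(ARM64_AM::isLogicalImmediate(Imm, 64));
//   uint64_t Enc = ARM64_AM::encodeLogicalImmediate(Imm, 64);
//   assert(ARM64_AM::decodeLogicalImmediate(Enc, 64) == Imm);
// A value such as 0x1234567800000000ULL has no rotated-run form, so
// isLogicalImmediate() returns false for it.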
|
||||
|
||||
/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value
|
||||
/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits)
|
||||
/// is a valid encoding for an integer value with regSize bits.
|
||||
static inline bool isValidDecodeLogicalImmediate(uint64_t val,
|
||||
unsigned regSize) {
|
||||
// Extract the N and imms fields needed for checking.
|
||||
unsigned N = (val >> 12) & 1;
|
||||
unsigned imms = val & 0x3f;
|
||||
|
||||
if (regSize == 32 && N != 0) // undefined logical immediate encoding
|
||||
return false;
|
||||
int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
|
||||
if (len < 0) // undefined logical immediate encoding
|
||||
return false;
|
||||
unsigned size = (1 << len);
|
||||
unsigned S = imms & (size - 1);
|
||||
if (S == size - 1) // undefined logical immediate encoding
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Floating-point Immediates
|
||||
//
|
||||
static inline float getFPImmFloat(unsigned Imm) {
|
||||
// We expect an 8-bit binary encoding of a floating-point number here.
|
||||
union {
|
||||
uint32_t I;
|
||||
float F;
|
||||
} FPUnion;
|
||||
|
||||
uint8_t Sign = (Imm >> 7) & 0x1;
|
||||
uint8_t Exp = (Imm >> 4) & 0x7;
|
||||
uint8_t Mantissa = Imm & 0xf;
|
||||
|
||||
// 8-bit FP iEEEE Float Encoding
|
||||
// abcd efgh aBbbbbbc defgh000 00000000 00000000
|
||||
//
|
||||
// where B = NOT(b);
|
||||
|
||||
FPUnion.I = 0;
|
||||
FPUnion.I |= Sign << 31;
|
||||
FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
|
||||
FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
|
||||
FPUnion.I |= (Exp & 0x3) << 23;
|
||||
FPUnion.I |= Mantissa << 19;
|
||||
return FPUnion.F;
|
||||
}
|
||||
|
||||
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
|
||||
/// floating-point value. If the value cannot be represented as an 8-bit
|
||||
/// floating-point value, then return -1.
|
||||
static inline int getFP32Imm(const APInt &Imm) {
|
||||
uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
|
||||
int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
|
||||
int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
|
||||
|
||||
// We can handle 4 bits of mantissa.
|
||||
// mantissa = (16+UInt(e:f:g:h))/16.
|
||||
if (Mantissa & 0x7ffff)
|
||||
return -1;
|
||||
Mantissa >>= 19;
|
||||
if ((Mantissa & 0xf) != Mantissa)
|
||||
return -1;
|
||||
|
||||
// We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
|
||||
if (Exp < -3 || Exp > 4)
|
||||
return -1;
|
||||
Exp = ((Exp+3) & 0x7) ^ 4;
|
||||
|
||||
return ((int)Sign << 7) | (Exp << 4) | Mantissa;
|
||||
}
|
||||
|
||||
static inline int getFP32Imm(const APFloat &FPImm) {
|
||||
return getFP32Imm(FPImm.bitcastToAPInt());
|
||||
}
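// Illustrative sketch (not part of the imported header): only values of the
// form +/-(16..31)/16 * 2^(-3..4) fit the 8-bit encoding, so 2.0f survives a
// round trip through the helpers above while 0.1f yields -1:
//   int Enc = ARM64_AM::getFP32Imm(APFloat(2.0f));
//   assert(Enc != -1 && ARM64_AM::getFPImmFloat(Enc) == 2.0f);
//   assert(ARM64_AM::getFP32Imm(APFloat(0.1f)) == -1);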
|
||||
|
||||
/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
|
||||
/// floating-point value. If the value cannot be represented as an 8-bit
|
||||
/// floating-point value, then return -1.
|
||||
static inline int getFP64Imm(const APInt &Imm) {
|
||||
uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
|
||||
int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
|
||||
uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
|
||||
|
||||
// We can handle 4 bits of mantissa.
|
||||
// mantissa = (16+UInt(e:f:g:h))/16.
|
||||
if (Mantissa & 0xffffffffffffULL)
|
||||
return -1;
|
||||
Mantissa >>= 48;
|
||||
if ((Mantissa & 0xf) != Mantissa)
|
||||
return -1;
|
||||
|
||||
// We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
|
||||
if (Exp < -3 || Exp > 4)
|
||||
return -1;
|
||||
Exp = ((Exp+3) & 0x7) ^ 4;
|
||||
|
||||
return ((int)Sign << 7) | (Exp << 4) | Mantissa;
|
||||
}
|
||||
|
||||
static inline int getFP64Imm(const APFloat &FPImm) {
|
||||
return getFP64Imm(FPImm.bitcastToAPInt());
|
||||
}
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// AdvSIMD Modified Immediates
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
|
||||
static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0xffffff00ffffff00ULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
|
||||
return (Imm & 0xffULL);
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 32) | EncVal;
|
||||
}
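// Illustrative sketch (not part of the imported header): Type1 is one byte
// replicated into the low byte of each 32-bit half, e.g.:
//   assert(ARM64_AM::isAdvSIMDModImmType1(0x000000ab000000abULL));
//   assert(ARM64_AM::encodeAdvSIMDModImmType1(0x000000ab000000abULL) == 0xab);
//   assert(ARM64_AM::decodeAdvSIMDModImmType1(0xab) == 0x000000ab000000abULL);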
|
||||
|
||||
// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
|
||||
static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0xffff00ffffff00ffULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
|
||||
return (Imm & 0xff00ULL) >> 8;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 40) | (EncVal << 8);
|
||||
}
|
||||
|
||||
// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
|
||||
static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0xff00ffffff00ffffULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
|
||||
return (Imm & 0xff0000ULL) >> 16;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 48) | (EncVal << 16);
|
||||
}
|
||||
|
||||
// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
|
||||
static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0x00ffffff00ffffffULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
|
||||
return (Imm & 0xff000000ULL) >> 24;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 56) | (EncVal << 24);
|
||||
}
|
||||
|
||||
// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
|
||||
static inline bool isAdvSIMDModImmType5(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
(((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) &&
|
||||
((Imm & 0xff00ff00ff00ff00ULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) {
|
||||
return (Imm & 0xffULL);
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal;
|
||||
}
|
||||
|
||||
// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
|
||||
static inline bool isAdvSIMDModImmType6(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
(((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) &&
|
||||
((Imm & 0x00ff00ff00ff00ffULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) {
|
||||
return (Imm & 0xff00ULL) >> 8;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8);
|
||||
}
|
||||
|
||||
// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
|
||||
static inline bool isAdvSIMDModImmType7(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) {
|
||||
return (Imm & 0xff00ULL) >> 8;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL;
|
||||
}
|
||||
|
||||
// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
|
||||
static inline bool isAdvSIMDModImmType8(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL);
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL;
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) {
|
||||
return (Imm & 0x00ff0000ULL) >> 16;
|
||||
}
|
||||
|
||||
// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
|
||||
static inline bool isAdvSIMDModImmType9(uint64_t Imm) {
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
((Imm >> 48) == (Imm & 0x0000ffffULL)) &&
|
||||
((Imm >> 56) == (Imm & 0x000000ffULL));
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) {
|
||||
return (Imm & 0xffULL);
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
|
||||
uint64_t EncVal = Imm;
|
||||
EncVal |= (EncVal << 8);
|
||||
EncVal |= (EncVal << 16);
|
||||
EncVal |= (EncVal << 32);
|
||||
return EncVal;
|
||||
}
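// Illustrative sketch (not part of the imported header): Type9 is a single
// byte replicated across all eight byte lanes, e.g.:
//   assert(ARM64_AM::isAdvSIMDModImmType9(0x4242424242424242ULL));
//   assert(ARM64_AM::decodeAdvSIMDModImmType9(
//              ARM64_AM::encodeAdvSIMDModImmType9(0x4242424242424242ULL)) ==
//          0x4242424242424242ULL);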
|
||||
|
||||
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
|
||||
// cmode: 1110, op: 1
|
||||
static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
|
||||
uint64_t ByteA = Imm & 0xff00000000000000ULL;
|
||||
uint64_t ByteB = Imm & 0x00ff000000000000ULL;
|
||||
uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
|
||||
uint64_t ByteD = Imm & 0x000000ff00000000ULL;
|
||||
uint64_t ByteE = Imm & 0x00000000ff000000ULL;
|
||||
uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
|
||||
uint64_t ByteG = Imm & 0x000000000000ff00ULL;
|
||||
uint64_t ByteH = Imm & 0x00000000000000ffULL;
|
||||
|
||||
return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) &&
|
||||
(ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
|
||||
(ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
|
||||
(ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
|
||||
(ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
|
||||
(ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
|
||||
(ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
|
||||
(ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
|
||||
bool BitA = Imm & 0xff00000000000000ULL;
|
||||
bool BitB = Imm & 0x00ff000000000000ULL;
|
||||
bool BitC = Imm & 0x0000ff0000000000ULL;
|
||||
bool BitD = Imm & 0x000000ff00000000ULL;
|
||||
bool BitE = Imm & 0x00000000ff000000ULL;
|
||||
bool BitF = Imm & 0x0000000000ff0000ULL;
|
||||
bool BitG = Imm & 0x000000000000ff00ULL;
|
||||
bool BitH = Imm & 0x00000000000000ffULL;
|
||||
|
||||
unsigned EncVal = BitA;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitB;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitC;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitD;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitE;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitF;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitG;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitH;
|
||||
return EncVal;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) {
|
||||
uint64_t EncVal = 0;
|
||||
if (Imm & 0x80) EncVal |= 0xff00000000000000ULL;
|
||||
if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
|
||||
if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
|
||||
if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
|
||||
if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
|
||||
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
|
||||
if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
|
||||
if (Imm & 0x01) EncVal |= 0x00000000000000ffULL;
|
||||
return EncVal;
|
||||
}
|
||||
|
||||
// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
|
||||
static inline bool isAdvSIMDModImmType11(uint64_t Imm) {
|
||||
uint64_t BString = (Imm & 0x7E000000ULL) >> 25;
|
||||
return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
|
||||
(BString == 0x1f || BString == 0x20) &&
|
||||
((Imm & 0x0007ffff0007ffffULL) == 0);
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) {
|
||||
bool BitA = (Imm & 0x80000000ULL);
|
||||
bool BitB = (Imm & 0x20000000ULL);
|
||||
bool BitC = (Imm & 0x01000000ULL);
|
||||
bool BitD = (Imm & 0x00800000ULL);
|
||||
bool BitE = (Imm & 0x00400000ULL);
|
||||
bool BitF = (Imm & 0x00200000ULL);
|
||||
bool BitG = (Imm & 0x00100000ULL);
|
||||
bool BitH = (Imm & 0x00080000ULL);
|
||||
|
||||
unsigned EncVal = BitA;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitB;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitC;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitD;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitE;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitF;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitG;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitH;
|
||||
return EncVal;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) {
|
||||
uint64_t EncVal = 0;
|
||||
if (Imm & 0x80) EncVal |= 0x80000000ULL;
|
||||
if (Imm & 0x40) EncVal |= 0x3e000000ULL;
|
||||
else EncVal |= 0x40000000ULL;
|
||||
if (Imm & 0x20) EncVal |= 0x01000000ULL;
|
||||
if (Imm & 0x10) EncVal |= 0x00800000ULL;
|
||||
if (Imm & 0x08) EncVal |= 0x00400000ULL;
|
||||
if (Imm & 0x04) EncVal |= 0x00200000ULL;
|
||||
if (Imm & 0x02) EncVal |= 0x00100000ULL;
|
||||
if (Imm & 0x01) EncVal |= 0x00080000ULL;
|
||||
return (EncVal << 32) | EncVal;
|
||||
}
|
||||
|
||||
// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
|
||||
static inline bool isAdvSIMDModImmType12(uint64_t Imm) {
|
||||
uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54;
|
||||
return ((BString == 0xff || BString == 0x100) &&
|
||||
((Imm & 0x0000ffffffffffffULL) == 0));
|
||||
}
|
||||
|
||||
static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) {
|
||||
bool BitA = (Imm & 0x8000000000000000ULL);
|
||||
bool BitB = (Imm & 0x0040000000000000ULL);
|
||||
bool BitC = (Imm & 0x0020000000000000ULL);
|
||||
bool BitD = (Imm & 0x0010000000000000ULL);
|
||||
bool BitE = (Imm & 0x0008000000000000ULL);
|
||||
bool BitF = (Imm & 0x0004000000000000ULL);
|
||||
bool BitG = (Imm & 0x0002000000000000ULL);
|
||||
bool BitH = (Imm & 0x0001000000000000ULL);
|
||||
|
||||
unsigned EncVal = BitA;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitB;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitC;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitD;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitE;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitF;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitG;
|
||||
EncVal <<= 1;
|
||||
EncVal |= BitH;
|
||||
return EncVal;
|
||||
}
|
||||
|
||||
static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
|
||||
uint64_t EncVal = 0;
|
||||
if (Imm & 0x80) EncVal |= 0x8000000000000000ULL;
|
||||
if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL;
|
||||
else EncVal |= 0x4000000000000000ULL;
|
||||
if (Imm & 0x20) EncVal |= 0x0020000000000000ULL;
|
||||
if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
|
||||
if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
|
||||
if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
|
||||
if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
|
||||
if (Imm & 0x01) EncVal |= 0x0001000000000000ULL;
|
||||
return (EncVal << 32) | EncVal;
|
||||
}
|
||||
|
||||
} // end namespace ARM64_AM
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
533  lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp (new file)
@ -0,0 +1,533 @@
|
||||
//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64.h"
|
||||
#include "ARM64RegisterInfo.h"
|
||||
#include "MCTargetDesc/ARM64FixupKinds.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/MC/MCAsmBackend.h"
|
||||
#include "llvm/MC/MCDirectives.h"
|
||||
#include "llvm/MC/MCFixupKindInfo.h"
|
||||
#include "llvm/MC/MCObjectWriter.h"
|
||||
#include "llvm/MC/MCSectionMachO.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MachO.h"
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
class ARM64AsmBackend : public MCAsmBackend {
|
||||
static const unsigned PCRelFlagVal =
|
||||
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
|
||||
|
||||
public:
|
||||
ARM64AsmBackend(const Target &T) : MCAsmBackend() {}
|
||||
|
||||
unsigned getNumFixupKinds() const { return ARM64::NumTargetFixupKinds; }
|
||||
|
||||
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
|
||||
const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = {
|
||||
// This table *must* be in the order that the fixup_* kinds are defined in
|
||||
// ARM64FixupKinds.h.
|
||||
//
|
||||
// Name Offset (bits) Size (bits) Flags
|
||||
{ "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
|
||||
{ "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
|
||||
{ "fixup_arm64_add_imm12", 10, 12, 0 },
|
||||
{ "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 },
|
||||
{ "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 },
|
||||
{ "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 },
|
||||
{ "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 },
|
||||
{ "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 },
|
||||
{ "fixup_arm64_movw", 5, 16, 0 },
|
||||
{ "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal },
|
||||
{ "fixup_arm64_pcrel_imm19", 5, 19, PCRelFlagVal },
|
||||
{ "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal },
|
||||
{ "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal },
|
||||
{ "fixup_arm64_tlsdesc_call", 0, 0, 0 }
|
||||
};
|
||||
|
||||
if (Kind < FirstTargetFixupKind)
|
||||
return MCAsmBackend::getFixupKindInfo(Kind);
|
||||
|
||||
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
|
||||
"Invalid kind!");
|
||||
return Infos[Kind - FirstTargetFixupKind];
|
||||
}
|
||||
|
||||
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
|
||||
uint64_t Value, bool IsPCRel) const;
|
||||
|
||||
bool mayNeedRelaxation(const MCInst &Inst) const;
|
||||
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
|
||||
const MCRelaxableFragment *DF,
|
||||
const MCAsmLayout &Layout) const;
|
||||
void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
|
||||
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
|
||||
|
||||
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
|
||||
|
||||
unsigned getPointerSize() const { return 8; }
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
/// \brief The number of bytes the fixup may change.
|
||||
static unsigned getFixupKindNumBytes(unsigned Kind) {
|
||||
switch (Kind) {
|
||||
default:
|
||||
assert(0 && "Unknown fixup kind!");
|
||||
|
||||
case ARM64::fixup_arm64_tlsdesc_call:
|
||||
return 0;
|
||||
|
||||
case FK_Data_1:
|
||||
return 1;
|
||||
|
||||
case FK_Data_2:
|
||||
case ARM64::fixup_arm64_movw:
|
||||
return 2;
|
||||
|
||||
case ARM64::fixup_arm64_pcrel_branch14:
|
||||
case ARM64::fixup_arm64_add_imm12:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale1:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale2:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale4:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale8:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale16:
|
||||
case ARM64::fixup_arm64_pcrel_imm19:
|
||||
return 3;
|
||||
|
||||
case ARM64::fixup_arm64_pcrel_adr_imm21:
|
||||
case ARM64::fixup_arm64_pcrel_adrp_imm21:
|
||||
case ARM64::fixup_arm64_pcrel_branch26:
|
||||
case ARM64::fixup_arm64_pcrel_call26:
|
||||
case FK_Data_4:
|
||||
return 4;
|
||||
|
||||
case FK_Data_8:
|
||||
return 8;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned AdrImmBits(unsigned Value) {
|
||||
unsigned lo2 = Value & 0x3;
|
||||
unsigned hi19 = (Value & 0x1ffffc) >> 2;
|
||||
return (hi19 << 5) | (lo2 << 29);
|
||||
}
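// Illustrative example of the split above: for a byte offset Value = 0x12345,
// lo2 = 0x12345 & 0x3 = 0x1 lands in bits 30:29 and
// hi19 = (0x12345 & 0x1ffffc) >> 2 = 0x48d1 lands in bits 23:5, matching the
// ADR/ADRP immlo/immhi field layout.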
static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
|
||||
int64_t SignedValue = static_cast<int64_t>(Value);
|
||||
switch (Kind) {
|
||||
default:
|
||||
assert(false && "Unknown fixup kind!");
|
||||
case ARM64::fixup_arm64_pcrel_adr_imm21:
|
||||
if (SignedValue > 2097151 || SignedValue < -2097152)
|
||||
report_fatal_error("fixup value out of range");
|
||||
return AdrImmBits(Value & 0x1fffffULL);
|
||||
case ARM64::fixup_arm64_pcrel_adrp_imm21:
|
||||
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
|
||||
case ARM64::fixup_arm64_pcrel_imm19:
|
||||
// Signed 21-bit immediate
|
||||
if (SignedValue > 2097151 || SignedValue < -2097152)
|
||||
report_fatal_error("fixup value out of range");
|
||||
// Low two bits are not encoded.
|
||||
return (Value >> 2) & 0x7ffff;
|
||||
case ARM64::fixup_arm64_add_imm12:
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale1:
|
||||
// Unsigned 12-bit immediate
|
||||
if (Value >= 0x1000)
|
||||
report_fatal_error("invalid imm12 fixup value");
|
||||
return Value;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale2:
|
||||
// Unsigned 12-bit immediate which gets multiplied by 2
|
||||
if (Value & 1 || Value >= 0x2000)
|
||||
report_fatal_error("invalid imm12 fixup value");
|
||||
return Value >> 1;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale4:
|
||||
// Unsigned 12-bit immediate which gets multiplied by 4
|
||||
if (Value & 3 || Value >= 0x4000)
|
||||
report_fatal_error("invalid imm12 fixup value");
|
||||
return Value >> 2;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale8:
|
||||
// Unsigned 12-bit immediate which gets multiplied by 8
|
||||
if (Value & 7 || Value >= 0x8000)
|
||||
report_fatal_error("invalid imm12 fixup value");
|
||||
return Value >> 3;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale16:
|
||||
// Unsigned 12-bit immediate which gets multiplied by 16
|
||||
if (Value & 15 || Value >= 0x10000)
|
||||
report_fatal_error("invalid imm12 fixup value");
|
||||
return Value >> 4;
|
||||
case ARM64::fixup_arm64_movw:
|
||||
report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
|
||||
return Value;
|
||||
case ARM64::fixup_arm64_pcrel_branch14:
|
||||
// Signed 16-bit immediate
|
||||
if (SignedValue > 32767 || SignedValue < -32768)
|
||||
report_fatal_error("fixup value out of range");
|
||||
// Low two bits are not encoded (4-byte alignment assumed).
|
||||
if (Value & 0x3)
|
||||
report_fatal_error("fixup not sufficiently aligned");
|
||||
return (Value >> 2) & 0x3fff;
|
||||
case ARM64::fixup_arm64_pcrel_branch26:
|
||||
case ARM64::fixup_arm64_pcrel_call26:
|
||||
// Signed 28-bit immediate
|
||||
if (SignedValue > 134217727 || SignedValue < -134217728)
|
||||
report_fatal_error("fixup value out of range");
|
||||
// Low two bits are not encoded (4-byte alignment assumed).
|
||||
if (Value & 0x3)
|
||||
report_fatal_error("fixup not sufficiently aligned");
|
||||
return (Value >> 2) & 0x3ffffff;
|
||||
case FK_Data_1:
|
||||
case FK_Data_2:
|
||||
case FK_Data_4:
|
||||
case FK_Data_8:
|
||||
return Value;
|
||||
}
|
||||
}
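// For example, a resolved branch26 fixup of +0x1000 bytes passes the range and
// alignment checks above and encodes as 0x1000 >> 2 = 0x400 in the 26-bit field.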
void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
|
||||
unsigned DataSize, uint64_t Value,
|
||||
bool IsPCRel) const {
|
||||
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
|
||||
if (!Value)
|
||||
return; // Doesn't change encoding.
|
||||
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
|
||||
// Apply any target-specific value adjustments.
|
||||
Value = adjustFixupValue(Fixup.getKind(), Value);
|
||||
|
||||
// Shift the value into position.
|
||||
Value <<= Info.TargetOffset;
|
||||
|
||||
unsigned Offset = Fixup.getOffset();
|
||||
assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
|
||||
|
||||
// For each byte of the fragment that the fixup touches, mask in the
|
||||
// bits from the fixup value.
|
||||
for (unsigned i = 0; i != NumBytes; ++i)
|
||||
Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
|
||||
}
|
||||
|
||||
bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
|
||||
const MCRelaxableFragment *DF,
|
||||
const MCAsmLayout &Layout) const {
|
||||
// FIXME: This isn't correct for ARM64. Just moving the "generic" logic
|
||||
// into the targets for now.
|
||||
//
|
||||
// Relax if the value is too big for a (signed) i8.
|
||||
return int64_t(Value) != int64_t(int8_t(Value));
|
||||
}
|
||||
|
||||
void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
|
||||
assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented");
|
||||
}
|
||||
|
||||
bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
|
||||
// If the count is not 4-byte aligned, we must be writing data into the text
|
||||
// section (otherwise we have unaligned instructions, and thus have far
|
||||
// bigger problems), so just write zeros instead.
|
||||
if ((Count & 3) != 0) {
|
||||
for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
|
||||
OW->Write8(0);
|
||||
}
|
||||
|
||||
// We are properly aligned, so write NOPs as requested.
|
||||
Count /= 4;
|
||||
for (uint64_t i = 0; i != Count; ++i)
|
||||
OW->Write32(0xd503201f);
|
||||
return true;
|
||||
}
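// For example, a request for 10 bytes of padding emits two zero bytes followed
// by two NOP instructions (0xd503201f), ten bytes in total.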
namespace {
|
||||
|
||||
namespace CU {
|
||||
|
||||
/// \brief Compact unwind encoding values.
|
||||
enum CompactUnwindEncodings {
|
||||
/// \brief A "frameless" leaf function, where no non-volatile registers are
|
||||
/// saved. The return remains in LR throughout the function.
|
||||
UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
|
||||
|
||||
/// \brief No compact unwind encoding available. Instead, the low 23 bits of
/// the compact unwind encoding are the offset of the DWARF FDE in the
/// __eh_frame section. This mode is never used in object files. It is only
/// generated by the linker in final linked images, which have only DWARF info
/// for a function.
|
||||
UNWIND_ARM64_MODE_DWARF = 0x03000000,
|
||||
|
||||
/// \brief This is a standard arm64 prologue where FP/LR are immediately
/// pushed on the stack, then SP is copied to FP. If there are any
/// non-volatile registers saved, they are copied into the stack frame in pairs
/// in a contiguous range right below the saved FP/LR pair. Any subset of the
/// five X pairs and four D pairs can be saved, but the memory layout must be
/// in register number order.
|
||||
UNWIND_ARM64_MODE_FRAME = 0x04000000,
|
||||
|
||||
/// \brief Frame register pair encodings.
|
||||
UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
|
||||
UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
|
||||
UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
|
||||
UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
|
||||
UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
|
||||
UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
|
||||
UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
|
||||
UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
|
||||
UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
|
||||
};
|
||||
|
||||
} // end CU namespace
|
||||
|
||||
// FIXME: This should be in a separate file.
|
||||
class DarwinARM64AsmBackend : public ARM64AsmBackend {
|
||||
const MCRegisterInfo &MRI;
|
||||
|
||||
/// \brief Encode compact unwind stack adjustment for frameless functions.
|
||||
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
|
||||
/// The stack size always needs to be 16 byte aligned.
|
||||
uint32_t encodeStackAdjustment(uint32_t StackSize) const {
|
||||
return (StackSize / 16) << 12;
|
||||
}
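// For example, a 64-byte frameless stack adjustment encodes as
// (64 / 16) << 12 = 0x4000, which lands in the
// UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK bits.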
public:
|
||||
DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
|
||||
: ARM64AsmBackend(T), MRI(MRI) {}
|
||||
|
||||
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
|
||||
return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
|
||||
MachO::CPU_SUBTYPE_ARM64_ALL);
|
||||
}
|
||||
|
||||
virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
|
||||
// Any section for which the linker breaks things into atoms needs to
|
||||
// preserve symbols, including assembler local symbols, to identify
|
||||
// those atoms. These sections are:
|
||||
// Sections of type:
|
||||
//
|
||||
// S_CSTRING_LITERALS (e.g. __cstring)
|
||||
// S_LITERAL_POINTERS (e.g. objc selector pointers)
|
||||
// S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
|
||||
//
|
||||
// Sections named:
|
||||
//
|
||||
// __TEXT,__eh_frame
|
||||
// __TEXT,__ustring
|
||||
// __DATA,__cfstring
|
||||
// __DATA,__objc_classrefs
|
||||
// __DATA,__objc_catlist
|
||||
//
|
||||
// FIXME: It would be better if the compiler used actual linker local
|
||||
// symbols for each of these sections rather than preserving what
|
||||
// are ostensibly assembler local symbols.
|
||||
const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
|
||||
return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
|
||||
SMO.getType() == MachO::S_4BYTE_LITERALS ||
|
||||
SMO.getType() == MachO::S_8BYTE_LITERALS ||
|
||||
SMO.getType() == MachO::S_16BYTE_LITERALS ||
|
||||
SMO.getType() == MachO::S_LITERAL_POINTERS ||
|
||||
(SMO.getSegmentName() == "__TEXT" &&
|
||||
(SMO.getSectionName() == "__eh_frame" ||
|
||||
SMO.getSectionName() == "__ustring")) ||
|
||||
(SMO.getSegmentName() == "__DATA" &&
|
||||
(SMO.getSectionName() == "__cfstring" ||
|
||||
SMO.getSectionName() == "__objc_classrefs" ||
|
||||
SMO.getSectionName() == "__objc_catlist")));
|
||||
}
|
||||
|
||||
/// \brief Generate the compact unwind encoding from the CFI directives.
|
||||
virtual uint32_t
|
||||
generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const
|
||||
override {
|
||||
if (Instrs.empty())
|
||||
return CU::UNWIND_ARM64_MODE_FRAMELESS;
|
||||
|
||||
bool HasFP = false;
|
||||
unsigned StackSize = 0;
|
||||
|
||||
uint32_t CompactUnwindEncoding = 0;
|
||||
for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
|
||||
const MCCFIInstruction &Inst = Instrs[i];
|
||||
|
||||
switch (Inst.getOperation()) {
|
||||
default:
|
||||
// Cannot handle this directive: bail out.
|
||||
return CU::UNWIND_ARM64_MODE_DWARF;
|
||||
case MCCFIInstruction::OpDefCfa: {
|
||||
// Defines a frame pointer.
|
||||
assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
|
||||
ARM64::FP &&
|
||||
"Invalid frame pointer!");
|
||||
assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
|
||||
|
||||
const MCCFIInstruction &LRPush = Instrs[++i];
|
||||
assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
|
||||
"Link register not pushed!");
|
||||
const MCCFIInstruction &FPPush = Instrs[++i];
|
||||
assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
|
||||
"Frame pointer not pushed!");
|
||||
|
||||
unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
|
||||
unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
|
||||
|
||||
LRReg = getXRegFromWReg(LRReg);
|
||||
FPReg = getXRegFromWReg(FPReg);
|
||||
|
||||
assert(LRReg == ARM64::LR && FPReg == ARM64::FP &&
|
||||
"Pushing invalid registers for frame!");
|
||||
|
||||
// Indicate that the function has a frame.
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
|
||||
HasFP = true;
|
||||
break;
|
||||
}
|
||||
case MCCFIInstruction::OpDefCfaOffset: {
|
||||
assert(StackSize == 0 && "We already have the CFA offset!");
|
||||
StackSize = std::abs(Inst.getOffset());
|
||||
break;
|
||||
}
|
||||
case MCCFIInstruction::OpOffset: {
|
||||
// Registers are saved in pairs. We expect there to be two consecutive
|
||||
// `.cfi_offset' instructions with the appropriate registers specified.
|
||||
unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
|
||||
if (i + 1 == e)
|
||||
return CU::UNWIND_ARM64_MODE_DWARF;
|
||||
|
||||
const MCCFIInstruction &Inst2 = Instrs[++i];
|
||||
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
|
||||
return CU::UNWIND_ARM64_MODE_DWARF;
|
||||
unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
|
||||
|
||||
// N.B. The encodings must be in register number order, and the X
|
||||
// registers before the D registers.
|
||||
|
||||
// X19/X20 pair = 0x00000001,
|
||||
// X21/X22 pair = 0x00000002,
|
||||
// X23/X24 pair = 0x00000004,
|
||||
// X25/X26 pair = 0x00000008,
|
||||
// X27/X28 pair = 0x00000010
|
||||
Reg1 = getXRegFromWReg(Reg1);
|
||||
Reg2 = getXRegFromWReg(Reg2);
|
||||
|
||||
if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 &&
|
||||
(CompactUnwindEncoding & 0xF1E) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
|
||||
else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 &&
|
||||
(CompactUnwindEncoding & 0xF1C) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
|
||||
else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 &&
|
||||
(CompactUnwindEncoding & 0xF18) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
|
||||
else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 &&
|
||||
(CompactUnwindEncoding & 0xF10) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
|
||||
else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 &&
|
||||
(CompactUnwindEncoding & 0xF00) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
|
||||
else {
|
||||
Reg1 = getDRegFromBReg(Reg1);
|
||||
Reg2 = getDRegFromBReg(Reg2);
|
||||
|
||||
// D8/D9 pair = 0x00000100,
|
||||
// D10/D11 pair = 0x00000200,
|
||||
// D12/D13 pair = 0x00000400,
|
||||
// D14/D15 pair = 0x00000800
|
||||
if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 &&
|
||||
(CompactUnwindEncoding & 0xE00) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
|
||||
else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 &&
|
||||
(CompactUnwindEncoding & 0xC00) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
|
||||
else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 &&
|
||||
(CompactUnwindEncoding & 0x800) == 0)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
|
||||
else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15)
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
|
||||
else
|
||||
// A pair was pushed which we cannot handle.
|
||||
return CU::UNWIND_ARM64_MODE_DWARF;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!HasFP) {
|
||||
// With compact unwind info we can only represent stack adjustments of up
|
||||
// to 65520 bytes.
|
||||
if (StackSize > 65520)
|
||||
return CU::UNWIND_ARM64_MODE_DWARF;
|
||||
|
||||
CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
|
||||
CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
|
||||
}
|
||||
|
||||
return CompactUnwindEncoding;
|
||||
}
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
namespace {
|
||||
|
||||
class ELFARM64AsmBackend : public ARM64AsmBackend {
|
||||
public:
|
||||
uint8_t OSABI;
|
||||
|
||||
ELFARM64AsmBackend(const Target &T, uint8_t OSABI)
|
||||
: ARM64AsmBackend(T), OSABI(OSABI) {}
|
||||
|
||||
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
|
||||
return createARM64ELFObjectWriter(OS, OSABI);
|
||||
}
|
||||
|
||||
void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
|
||||
const MCFixup &Fixup, const MCFragment *DF,
|
||||
const MCValue &Target, uint64_t &Value,
|
||||
bool &IsResolved) override;
|
||||
};
|
||||
|
||||
void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm,
|
||||
const MCAsmLayout &Layout,
|
||||
const MCFixup &Fixup,
|
||||
const MCFragment *DF,
|
||||
const MCValue &Target,
|
||||
uint64_t &Value, bool &IsResolved) {
|
||||
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
|
||||
// ~0xfff. This means that the required offset to reach a symbol can vary by
|
||||
// up to one step depending on where the ADRP is in memory. For example:
|
||||
//
|
||||
// ADRP x0, there
|
||||
// there:
|
||||
//
|
||||
// If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
|
||||
// we'll need that as an offset. At any other address "there" will be in the
|
||||
// same page as the ADRP and the instruction should encode 0x0. Assuming the
|
||||
// section isn't 0x1000-aligned, we therefore need to delegate this decision
|
||||
// to the linker -- a relocation!
|
||||
if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21)
|
||||
IsResolved = false;
|
||||
}
|
||||
}
|
||||
|
||||
MCAsmBackend *llvm::createARM64AsmBackend(const Target &T,
|
||||
const MCRegisterInfo &MRI,
|
||||
StringRef TT, StringRef CPU) {
|
||||
Triple TheTriple(TT);
|
||||
|
||||
if (TheTriple.isOSDarwin())
|
||||
return new DarwinARM64AsmBackend(T, MRI);
|
||||
|
||||
assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
|
||||
return new ELFARM64AsmBackend(T, TheTriple.getOS());
|
||||
}
|
998	lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h	Normal file
@ -0,0 +1,998 @@
//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains small standalone helper functions and enum definitions for
|
||||
// the ARM64 target useful for the compiler back-end and the MC libraries.
|
||||
// As such, it deliberately does not include references to LLVM core
|
||||
// code gen types, passes, etc..
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64BASEINFO_H
|
||||
#define ARM64BASEINFO_H
|
||||
|
||||
#include "ARM64MCTargetDesc.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
inline static unsigned getWRegFromXReg(unsigned Reg) {
|
||||
switch (Reg) {
|
||||
case ARM64::X0: return ARM64::W0;
|
||||
case ARM64::X1: return ARM64::W1;
|
||||
case ARM64::X2: return ARM64::W2;
|
||||
case ARM64::X3: return ARM64::W3;
|
||||
case ARM64::X4: return ARM64::W4;
|
||||
case ARM64::X5: return ARM64::W5;
|
||||
case ARM64::X6: return ARM64::W6;
|
||||
case ARM64::X7: return ARM64::W7;
|
||||
case ARM64::X8: return ARM64::W8;
|
||||
case ARM64::X9: return ARM64::W9;
|
||||
case ARM64::X10: return ARM64::W10;
|
||||
case ARM64::X11: return ARM64::W11;
|
||||
case ARM64::X12: return ARM64::W12;
|
||||
case ARM64::X13: return ARM64::W13;
|
||||
case ARM64::X14: return ARM64::W14;
|
||||
case ARM64::X15: return ARM64::W15;
|
||||
case ARM64::X16: return ARM64::W16;
|
||||
case ARM64::X17: return ARM64::W17;
|
||||
case ARM64::X18: return ARM64::W18;
|
||||
case ARM64::X19: return ARM64::W19;
|
||||
case ARM64::X20: return ARM64::W20;
|
||||
case ARM64::X21: return ARM64::W21;
|
||||
case ARM64::X22: return ARM64::W22;
|
||||
case ARM64::X23: return ARM64::W23;
|
||||
case ARM64::X24: return ARM64::W24;
|
||||
case ARM64::X25: return ARM64::W25;
|
||||
case ARM64::X26: return ARM64::W26;
|
||||
case ARM64::X27: return ARM64::W27;
|
||||
case ARM64::X28: return ARM64::W28;
|
||||
case ARM64::FP: return ARM64::W29;
|
||||
case ARM64::LR: return ARM64::W30;
|
||||
case ARM64::SP: return ARM64::WSP;
|
||||
case ARM64::XZR: return ARM64::WZR;
|
||||
}
|
||||
// For anything else, return it unchanged.
|
||||
return Reg;
|
||||
}
|
||||
|
||||
inline static unsigned getXRegFromWReg(unsigned Reg) {
|
||||
switch (Reg) {
|
||||
case ARM64::W0: return ARM64::X0;
|
||||
case ARM64::W1: return ARM64::X1;
|
||||
case ARM64::W2: return ARM64::X2;
|
||||
case ARM64::W3: return ARM64::X3;
|
||||
case ARM64::W4: return ARM64::X4;
|
||||
case ARM64::W5: return ARM64::X5;
|
||||
case ARM64::W6: return ARM64::X6;
|
||||
case ARM64::W7: return ARM64::X7;
|
||||
case ARM64::W8: return ARM64::X8;
|
||||
case ARM64::W9: return ARM64::X9;
|
||||
case ARM64::W10: return ARM64::X10;
|
||||
case ARM64::W11: return ARM64::X11;
|
||||
case ARM64::W12: return ARM64::X12;
|
||||
case ARM64::W13: return ARM64::X13;
|
||||
case ARM64::W14: return ARM64::X14;
|
||||
case ARM64::W15: return ARM64::X15;
|
||||
case ARM64::W16: return ARM64::X16;
|
||||
case ARM64::W17: return ARM64::X17;
|
||||
case ARM64::W18: return ARM64::X18;
|
||||
case ARM64::W19: return ARM64::X19;
|
||||
case ARM64::W20: return ARM64::X20;
|
||||
case ARM64::W21: return ARM64::X21;
|
||||
case ARM64::W22: return ARM64::X22;
|
||||
case ARM64::W23: return ARM64::X23;
|
||||
case ARM64::W24: return ARM64::X24;
|
||||
case ARM64::W25: return ARM64::X25;
|
||||
case ARM64::W26: return ARM64::X26;
|
||||
case ARM64::W27: return ARM64::X27;
|
||||
case ARM64::W28: return ARM64::X28;
|
||||
case ARM64::W29: return ARM64::FP;
|
||||
case ARM64::W30: return ARM64::LR;
|
||||
case ARM64::WSP: return ARM64::SP;
|
||||
case ARM64::WZR: return ARM64::XZR;
|
||||
}
|
||||
// For anything else, return it unchanged.
|
||||
return Reg;
|
||||
}
|
||||
|
||||
static inline unsigned getBRegFromDReg(unsigned Reg) {
|
||||
switch (Reg) {
|
||||
case ARM64::D0: return ARM64::B0;
|
||||
case ARM64::D1: return ARM64::B1;
|
||||
case ARM64::D2: return ARM64::B2;
|
||||
case ARM64::D3: return ARM64::B3;
|
||||
case ARM64::D4: return ARM64::B4;
|
||||
case ARM64::D5: return ARM64::B5;
|
||||
case ARM64::D6: return ARM64::B6;
|
||||
case ARM64::D7: return ARM64::B7;
|
||||
case ARM64::D8: return ARM64::B8;
|
||||
case ARM64::D9: return ARM64::B9;
|
||||
case ARM64::D10: return ARM64::B10;
|
||||
case ARM64::D11: return ARM64::B11;
|
||||
case ARM64::D12: return ARM64::B12;
|
||||
case ARM64::D13: return ARM64::B13;
|
||||
case ARM64::D14: return ARM64::B14;
|
||||
case ARM64::D15: return ARM64::B15;
|
||||
case ARM64::D16: return ARM64::B16;
|
||||
case ARM64::D17: return ARM64::B17;
|
||||
case ARM64::D18: return ARM64::B18;
|
||||
case ARM64::D19: return ARM64::B19;
|
||||
case ARM64::D20: return ARM64::B20;
|
||||
case ARM64::D21: return ARM64::B21;
|
||||
case ARM64::D22: return ARM64::B22;
|
||||
case ARM64::D23: return ARM64::B23;
|
||||
case ARM64::D24: return ARM64::B24;
|
||||
case ARM64::D25: return ARM64::B25;
|
||||
case ARM64::D26: return ARM64::B26;
|
||||
case ARM64::D27: return ARM64::B27;
|
||||
case ARM64::D28: return ARM64::B28;
|
||||
case ARM64::D29: return ARM64::B29;
|
||||
case ARM64::D30: return ARM64::B30;
|
||||
case ARM64::D31: return ARM64::B31;
|
||||
}
|
||||
// For anything else, return it unchanged.
|
||||
return Reg;
|
||||
}
|
||||
|
||||
|
||||
static inline unsigned getDRegFromBReg(unsigned Reg) {
|
||||
switch (Reg) {
|
||||
case ARM64::B0: return ARM64::D0;
|
||||
case ARM64::B1: return ARM64::D1;
|
||||
case ARM64::B2: return ARM64::D2;
|
||||
case ARM64::B3: return ARM64::D3;
|
||||
case ARM64::B4: return ARM64::D4;
|
||||
case ARM64::B5: return ARM64::D5;
|
||||
case ARM64::B6: return ARM64::D6;
|
||||
case ARM64::B7: return ARM64::D7;
|
||||
case ARM64::B8: return ARM64::D8;
|
||||
case ARM64::B9: return ARM64::D9;
|
||||
case ARM64::B10: return ARM64::D10;
|
||||
case ARM64::B11: return ARM64::D11;
|
||||
case ARM64::B12: return ARM64::D12;
|
||||
case ARM64::B13: return ARM64::D13;
|
||||
case ARM64::B14: return ARM64::D14;
|
||||
case ARM64::B15: return ARM64::D15;
|
||||
case ARM64::B16: return ARM64::D16;
|
||||
case ARM64::B17: return ARM64::D17;
|
||||
case ARM64::B18: return ARM64::D18;
|
||||
case ARM64::B19: return ARM64::D19;
|
||||
case ARM64::B20: return ARM64::D20;
|
||||
case ARM64::B21: return ARM64::D21;
|
||||
case ARM64::B22: return ARM64::D22;
|
||||
case ARM64::B23: return ARM64::D23;
|
||||
case ARM64::B24: return ARM64::D24;
|
||||
case ARM64::B25: return ARM64::D25;
|
||||
case ARM64::B26: return ARM64::D26;
|
||||
case ARM64::B27: return ARM64::D27;
|
||||
case ARM64::B28: return ARM64::D28;
|
||||
case ARM64::B29: return ARM64::D29;
|
||||
case ARM64::B30: return ARM64::D30;
|
||||
case ARM64::B31: return ARM64::D31;
|
||||
}
|
||||
// For anything else, return it unchanged.
|
||||
return Reg;
|
||||
}
|
||||
|
||||
namespace ARM64CC {
|
||||
|
||||
// The CondCodes constants map directly to the 4-bit encoding of the condition
|
||||
// field for predicated instructions.
|
||||
enum CondCode { // Meaning (integer) Meaning (floating-point)
|
||||
EQ = 0x0, // Equal Equal
|
||||
NE = 0x1, // Not equal Not equal, or unordered
|
||||
CS = 0x2, // Carry set >, ==, or unordered
|
||||
CC = 0x3, // Carry clear Less than
|
||||
MI = 0x4, // Minus, negative Less than
|
||||
PL = 0x5, // Plus, positive or zero >, ==, or unordered
|
||||
VS = 0x6, // Overflow Unordered
|
||||
VC = 0x7, // No overflow Not unordered
|
||||
HI = 0x8, // Unsigned higher Greater than, or unordered
|
||||
LS = 0x9, // Unsigned lower or same Less than or equal
|
||||
GE = 0xa, // Greater than or equal Greater than or equal
|
||||
LT = 0xb, // Less than Less than, or unordered
|
||||
GT = 0xc, // Greater than Greater than
|
||||
LE = 0xd, // Less than or equal <, ==, or unordered
|
||||
AL = 0xe // Always (unconditional) Always (unconditional)
|
||||
};
|
||||
|
||||
inline static const char *getCondCodeName(CondCode Code) {
|
||||
// cond<0> is ignored when cond<3:1> = 111, where 1110 is 0xe (aka AL).
|
||||
if ((Code & AL) == AL)
|
||||
Code = AL;
|
||||
switch (Code) {
|
||||
case EQ: return "eq";
|
||||
case NE: return "ne";
|
||||
case CS: return "cs";
|
||||
case CC: return "cc";
|
||||
case MI: return "mi";
|
||||
case PL: return "pl";
|
||||
case VS: return "vs";
|
||||
case VC: return "vc";
|
||||
case HI: return "hi";
|
||||
case LS: return "ls";
|
||||
case GE: return "ge";
|
||||
case LT: return "lt";
|
||||
case GT: return "gt";
|
||||
case LE: return "le";
|
||||
case AL: return "al";
|
||||
}
|
||||
llvm_unreachable("Unknown condition code");
|
||||
}
|
||||
|
||||
inline static CondCode getInvertedCondCode(CondCode Code) {
|
||||
switch (Code) {
|
||||
default: llvm_unreachable("Unknown condition code");
|
||||
case EQ: return NE;
|
||||
case NE: return EQ;
|
||||
case CS: return CC;
|
||||
case CC: return CS;
|
||||
case MI: return PL;
|
||||
case PL: return MI;
|
||||
case VS: return VC;
|
||||
case VC: return VS;
|
||||
case HI: return LS;
|
||||
case LS: return HI;
|
||||
case GE: return LT;
|
||||
case LT: return GE;
|
||||
case GT: return LE;
|
||||
case LE: return GT;
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a condition code, return NZCV flags that would satisfy that condition.
|
||||
/// The flag bits are in the format expected by the ccmp instructions.
|
||||
/// Note that many different flag settings can satisfy a given condition code;
/// this function just returns one of them.
|
||||
inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
|
||||
// NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
|
||||
enum { N = 8, Z = 4, C = 2, V = 1 };
|
||||
switch (Code) {
|
||||
default: llvm_unreachable("Unknown condition code");
|
||||
case EQ: return Z; // Z == 1
|
||||
case NE: return 0; // Z == 0
|
||||
case CS: return C; // C == 1
|
||||
case CC: return 0; // C == 0
|
||||
case MI: return N; // N == 1
|
||||
case PL: return 0; // N == 0
|
||||
case VS: return V; // V == 1
|
||||
case VC: return 0; // V == 0
|
||||
case HI: return C; // C == 1 && Z == 0
|
||||
case LS: return 0; // C == 0 || Z == 1
|
||||
case GE: return 0; // N == V
|
||||
case LT: return N; // N != V
|
||||
case GT: return 0; // Z == 0 && N == V
|
||||
case LE: return Z; // Z == 1 || N != V
|
||||
}
|
||||
}
|
||||
} // end namespace ARM64CC
|
||||
|
||||
namespace ARM64SYS {
|
||||
enum BarrierOption {
|
||||
InvalidBarrier = 0xff,
|
||||
OSHLD = 0x1,
|
||||
OSHST = 0x2,
|
||||
OSH = 0x3,
|
||||
NSHLD = 0x5,
|
||||
NSHST = 0x6,
|
||||
NSH = 0x7,
|
||||
ISHLD = 0x9,
|
||||
ISHST = 0xa,
|
||||
ISH = 0xb,
|
||||
LD = 0xd,
|
||||
ST = 0xe,
|
||||
SY = 0xf
|
||||
};
|
||||
|
||||
inline static const char *getBarrierOptName(BarrierOption Opt) {
|
||||
switch (Opt) {
|
||||
default: return NULL;
|
||||
case 0x1: return "oshld";
|
||||
case 0x2: return "oshst";
|
||||
case 0x3: return "osh";
|
||||
case 0x5: return "nshld";
|
||||
case 0x6: return "nshst";
|
||||
case 0x7: return "nsh";
|
||||
case 0x9: return "ishld";
|
||||
case 0xa: return "ishst";
|
||||
case 0xb: return "ish";
|
||||
case 0xd: return "ld";
|
||||
case 0xe: return "st";
|
||||
case 0xf: return "sy";
|
||||
}
|
||||
}
|
||||
|
||||
#define A64_SYSREG_ENC(op0,CRn,op2,CRm,op1) ((op0) << 14 | (op1) << 11 | \
|
||||
(CRn) << 7 | (CRm) << 3 | (op2))
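// For example, A64_SYSREG_ENC(2, 0, 0, 1, 3), used for MDCCSR_EL0 below,
// expands to (2 << 14) | (3 << 11) | (0 << 7) | (1 << 3) | 0 = 0x9808.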
enum SystemRegister {
|
||||
InvalidSystemReg = 0,
|
||||
// Table in section 3.10.3
|
||||
SPSR_EL1 = 0xc200,
|
||||
SPSR_svc = SPSR_EL1,
|
||||
ELR_EL1 = 0xc201,
|
||||
SP_EL0 = 0xc208,
|
||||
SPSel = 0xc210,
|
||||
CurrentEL = 0xc212,
|
||||
DAIF = 0xda11,
|
||||
NZCV = 0xda10,
|
||||
FPCR = 0xda20,
|
||||
FPSR = 0xda21,
|
||||
DSPSR = 0xda28,
|
||||
DLR = 0xda29,
|
||||
SPSR_EL2 = 0xe200,
|
||||
SPSR_hyp = SPSR_EL2,
|
||||
ELR_EL2 = 0xe201,
|
||||
SP_EL1 = 0xe208,
|
||||
SPSR_irq = 0xe218,
|
||||
SPSR_abt = 0xe219,
|
||||
SPSR_und = 0xe21a,
|
||||
SPSR_fiq = 0xe21b,
|
||||
SPSR_EL3 = 0xf200,
|
||||
ELR_EL3 = 0xf201,
|
||||
SP_EL2 = 0xf208,
|
||||
|
||||
|
||||
// Table in section 3.10.8
|
||||
MIDR_EL1 = 0xc000,
|
||||
CTR_EL0 = 0xd801,
|
||||
MPIDR_EL1 = 0xc005,
|
||||
ECOIDR_EL1 = 0xc006,
|
||||
DCZID_EL0 = 0xd807,
|
||||
MVFR0_EL1 = 0xc018,
|
||||
MVFR1_EL1 = 0xc019,
|
||||
ID_AA64PFR0_EL1 = 0xc020,
|
||||
ID_AA64PFR1_EL1 = 0xc021,
|
||||
ID_AA64DFR0_EL1 = 0xc028,
|
||||
ID_AA64DFR1_EL1 = 0xc029,
|
||||
ID_AA64ISAR0_EL1 = 0xc030,
|
||||
ID_AA64ISAR1_EL1 = 0xc031,
|
||||
ID_AA64MMFR0_EL1 = 0xc038,
|
||||
ID_AA64MMFR1_EL1 = 0xc039,
|
||||
CCSIDR_EL1 = 0xc800,
|
||||
CLIDR_EL1 = 0xc801,
|
||||
AIDR_EL1 = 0xc807,
|
||||
CSSELR_EL1 = 0xd000,
|
||||
VPIDR_EL2 = 0xe000,
|
||||
VMPIDR_EL2 = 0xe005,
|
||||
SCTLR_EL1 = 0xc080,
|
||||
SCTLR_EL2 = 0xe080,
|
||||
SCTLR_EL3 = 0xf080,
|
||||
ACTLR_EL1 = 0xc081,
|
||||
ACTLR_EL2 = 0xe081,
|
||||
ACTLR_EL3 = 0xf081,
|
||||
CPACR_EL1 = 0xc082,
|
||||
CPTR_EL2 = 0xe08a,
|
||||
CPTR_EL3 = 0xf08a,
|
||||
SCR_EL3 = 0xf088,
|
||||
HCR_EL2 = 0xe088,
|
||||
MDCR_EL2 = 0xe089,
|
||||
MDCR_EL3 = 0xf099,
|
||||
HSTR_EL2 = 0xe08b,
|
||||
HACR_EL2 = 0xe08f,
|
||||
TTBR0_EL1 = 0xc100,
|
||||
TTBR1_EL1 = 0xc101,
|
||||
TTBR0_EL2 = 0xe100,
|
||||
TTBR0_EL3 = 0xf100,
|
||||
VTTBR_EL2 = 0xe108,
|
||||
TCR_EL1 = 0xc102,
|
||||
TCR_EL2 = 0xe102,
|
||||
TCR_EL3 = 0xf102,
|
||||
VTCR_EL2 = 0xe10a,
|
||||
ADFSR_EL1 = 0xc288,
|
||||
AIFSR_EL1 = 0xc289,
|
||||
ADFSR_EL2 = 0xe288,
|
||||
AIFSR_EL2 = 0xe289,
|
||||
ADFSR_EL3 = 0xf288,
|
||||
AIFSR_EL3 = 0xf289,
|
||||
ESR_EL1 = 0xc290,
|
||||
ESR_EL2 = 0xe290,
|
||||
ESR_EL3 = 0xf290,
|
||||
FAR_EL1 = 0xc300,
|
||||
FAR_EL2 = 0xe300,
|
||||
FAR_EL3 = 0xf300,
|
||||
HPFAR_EL2 = 0xe304,
|
||||
PAR_EL1 = 0xc3a0,
|
||||
MAIR_EL1 = 0xc510,
|
||||
MAIR_EL2 = 0xe510,
|
||||
MAIR_EL3 = 0xf510,
|
||||
AMAIR_EL1 = 0xc518,
|
||||
AMAIR_EL2 = 0xe518,
|
||||
AMAIR_EL3 = 0xf518,
|
||||
VBAR_EL1 = 0xc600,
|
||||
VBAR_EL2 = 0xe600,
|
||||
VBAR_EL3 = 0xf600,
|
||||
RVBAR_EL1 = 0xc601,
|
||||
RVBAR_EL2 = 0xe601,
|
||||
RVBAR_EL3 = 0xf601,
|
||||
ISR_EL1 = 0xc608,
|
||||
CONTEXTIDR_EL1 = 0xc681,
|
||||
TPIDR_EL0 = 0xde82,
|
||||
TPIDRRO_EL0 = 0xde83,
|
||||
TPIDR_EL1 = 0xc684,
|
||||
TPIDR_EL2 = 0xe682,
|
||||
TPIDR_EL3 = 0xf682,
|
||||
TEECR32_EL1 = 0x9000,
|
||||
CNTFRQ_EL0 = 0xdf00,
|
||||
CNTPCT_EL0 = 0xdf01,
|
||||
CNTVCT_EL0 = 0xdf02,
|
||||
CNTVOFF_EL2 = 0xe703,
|
||||
CNTKCTL_EL1 = 0xc708,
|
||||
CNTHCTL_EL2 = 0xe708,
|
||||
CNTP_TVAL_EL0 = 0xdf10,
|
||||
CNTP_CTL_EL0 = 0xdf11,
|
||||
CNTP_CVAL_EL0 = 0xdf12,
|
||||
CNTV_TVAL_EL0 = 0xdf18,
|
||||
CNTV_CTL_EL0 = 0xdf19,
|
||||
CNTV_CVAL_EL0 = 0xdf1a,
|
||||
CNTHP_TVAL_EL2 = 0xe710,
|
||||
CNTHP_CTL_EL2 = 0xe711,
|
||||
CNTHP_CVAL_EL2 = 0xe712,
|
||||
CNTPS_TVAL_EL1 = 0xff10,
|
||||
CNTPS_CTL_EL1 = 0xff11,
|
||||
CNTPS_CVAL_EL1= 0xff12,
|
||||
|
||||
PMEVCNTR0_EL0 = 0xdf40,
|
||||
PMEVCNTR1_EL0 = 0xdf41,
|
||||
PMEVCNTR2_EL0 = 0xdf42,
|
||||
PMEVCNTR3_EL0 = 0xdf43,
|
||||
PMEVCNTR4_EL0 = 0xdf44,
|
||||
PMEVCNTR5_EL0 = 0xdf45,
|
||||
PMEVCNTR6_EL0 = 0xdf46,
|
||||
PMEVCNTR7_EL0 = 0xdf47,
|
||||
PMEVCNTR8_EL0 = 0xdf48,
|
||||
PMEVCNTR9_EL0 = 0xdf49,
|
||||
PMEVCNTR10_EL0 = 0xdf4a,
|
||||
PMEVCNTR11_EL0 = 0xdf4b,
|
||||
PMEVCNTR12_EL0 = 0xdf4c,
|
||||
PMEVCNTR13_EL0 = 0xdf4d,
|
||||
PMEVCNTR14_EL0 = 0xdf4e,
|
||||
PMEVCNTR15_EL0 = 0xdf4f,
|
||||
PMEVCNTR16_EL0 = 0xdf50,
|
||||
PMEVCNTR17_EL0 = 0xdf51,
|
||||
PMEVCNTR18_EL0 = 0xdf52,
|
||||
PMEVCNTR19_EL0 = 0xdf53,
|
||||
PMEVCNTR20_EL0 = 0xdf54,
|
||||
PMEVCNTR21_EL0 = 0xdf55,
|
||||
PMEVCNTR22_EL0 = 0xdf56,
|
||||
PMEVCNTR23_EL0 = 0xdf57,
|
||||
PMEVCNTR24_EL0 = 0xdf58,
|
||||
PMEVCNTR25_EL0 = 0xdf59,
|
||||
PMEVCNTR26_EL0 = 0xdf5a,
|
||||
PMEVCNTR27_EL0 = 0xdf5b,
|
||||
PMEVCNTR28_EL0 = 0xdf5c,
|
||||
PMEVCNTR29_EL0 = 0xdf5d,
|
||||
PMEVCNTR30_EL0 = 0xdf5e,
|
||||
|
||||
PMEVTYPER0_EL0 = 0xdf60,
|
||||
PMEVTYPER1_EL0 = 0xdf61,
|
||||
PMEVTYPER2_EL0 = 0xdf62,
|
||||
PMEVTYPER3_EL0 = 0xdf63,
|
||||
PMEVTYPER4_EL0 = 0xdf64,
|
||||
PMEVTYPER5_EL0 = 0xdf65,
|
||||
PMEVTYPER6_EL0 = 0xdf66,
|
||||
PMEVTYPER7_EL0 = 0xdf67,
|
||||
PMEVTYPER8_EL0 = 0xdf68,
|
||||
PMEVTYPER9_EL0 = 0xdf69,
|
||||
PMEVTYPER10_EL0 = 0xdf6a,
|
||||
PMEVTYPER11_EL0 = 0xdf6b,
|
||||
PMEVTYPER12_EL0 = 0xdf6c,
|
||||
PMEVTYPER13_EL0 = 0xdf6d,
|
||||
PMEVTYPER14_EL0 = 0xdf6e,
|
||||
PMEVTYPER15_EL0 = 0xdf6f,
|
||||
PMEVTYPER16_EL0 = 0xdf70,
|
||||
PMEVTYPER17_EL0 = 0xdf71,
|
||||
PMEVTYPER18_EL0 = 0xdf72,
|
||||
PMEVTYPER19_EL0 = 0xdf73,
|
||||
PMEVTYPER20_EL0 = 0xdf74,
|
||||
PMEVTYPER21_EL0 = 0xdf75,
|
||||
PMEVTYPER22_EL0 = 0xdf76,
|
||||
PMEVTYPER23_EL0 = 0xdf77,
|
||||
PMEVTYPER24_EL0 = 0xdf78,
|
||||
PMEVTYPER25_EL0 = 0xdf79,
|
||||
PMEVTYPER26_EL0 = 0xdf7a,
|
||||
PMEVTYPER27_EL0 = 0xdf7b,
|
||||
PMEVTYPER28_EL0 = 0xdf7c,
|
||||
PMEVTYPER29_EL0 = 0xdf7d,
|
||||
PMEVTYPER30_EL0 = 0xdf7e,
|
||||
|
||||
PMCCFILTR_EL0 = 0xdf7f,
|
||||
|
||||
RMR_EL3 = 0xf602,
|
||||
RMR_EL2 = 0xd602,
|
||||
RMR_EL1 = 0xce02,
|
||||
|
||||
// Debug Architecture 5.3, Table 17.
|
||||
MDCCSR_EL0 = A64_SYSREG_ENC(2, 0, 0, 1, 3),
|
||||
MDCCINT_EL1 = A64_SYSREG_ENC(2, 0, 0, 2, 0),
|
||||
DBGDTR_EL0 = A64_SYSREG_ENC(2, 0, 0, 4, 3),
|
||||
DBGDTRRX_EL0 = A64_SYSREG_ENC(2, 0, 0, 5, 3),
|
||||
DBGDTRTX_EL0 = DBGDTRRX_EL0,
|
||||
DBGVCR32_EL2 = A64_SYSREG_ENC(2, 0, 0, 7, 4),
|
||||
OSDTRRX_EL1 = A64_SYSREG_ENC(2, 0, 2, 0, 0),
|
||||
MDSCR_EL1 = A64_SYSREG_ENC(2, 0, 2, 2, 0),
|
||||
OSDTRTX_EL1 = A64_SYSREG_ENC(2, 0, 2, 3, 0),
|
||||
OSECCR_EL11 = A64_SYSREG_ENC(2, 0, 2, 6, 0),
|
||||
|
||||
DBGBVR0_EL1 = A64_SYSREG_ENC(2, 0, 4, 0, 0),
|
||||
DBGBVR1_EL1 = A64_SYSREG_ENC(2, 0, 4, 1, 0),
|
||||
DBGBVR2_EL1 = A64_SYSREG_ENC(2, 0, 4, 2, 0),
|
||||
DBGBVR3_EL1 = A64_SYSREG_ENC(2, 0, 4, 3, 0),
|
||||
DBGBVR4_EL1 = A64_SYSREG_ENC(2, 0, 4, 4, 0),
|
||||
DBGBVR5_EL1 = A64_SYSREG_ENC(2, 0, 4, 5, 0),
|
||||
DBGBVR6_EL1 = A64_SYSREG_ENC(2, 0, 4, 6, 0),
|
||||
DBGBVR7_EL1 = A64_SYSREG_ENC(2, 0, 4, 7, 0),
|
||||
DBGBVR8_EL1 = A64_SYSREG_ENC(2, 0, 4, 8, 0),
|
||||
DBGBVR9_EL1 = A64_SYSREG_ENC(2, 0, 4, 9, 0),
|
||||
DBGBVR10_EL1 = A64_SYSREG_ENC(2, 0, 4, 10, 0),
|
||||
DBGBVR11_EL1 = A64_SYSREG_ENC(2, 0, 4, 11, 0),
|
||||
DBGBVR12_EL1 = A64_SYSREG_ENC(2, 0, 4, 12, 0),
|
||||
DBGBVR13_EL1 = A64_SYSREG_ENC(2, 0, 4, 13, 0),
|
||||
DBGBVR14_EL1 = A64_SYSREG_ENC(2, 0, 4, 14, 0),
|
||||
DBGBVR15_EL1 = A64_SYSREG_ENC(2, 0, 4, 15, 0),
|
||||
|
||||
DBGBCR0_EL1 = A64_SYSREG_ENC(2, 0, 5, 0, 0),
|
||||
DBGBCR1_EL1 = A64_SYSREG_ENC(2, 0, 5, 1, 0),
|
||||
DBGBCR2_EL1 = A64_SYSREG_ENC(2, 0, 5, 2, 0),
|
||||
DBGBCR3_EL1 = A64_SYSREG_ENC(2, 0, 5, 3, 0),
|
||||
DBGBCR4_EL1 = A64_SYSREG_ENC(2, 0, 5, 4, 0),
|
||||
DBGBCR5_EL1 = A64_SYSREG_ENC(2, 0, 5, 5, 0),
|
||||
DBGBCR6_EL1 = A64_SYSREG_ENC(2, 0, 5, 6, 0),
|
||||
DBGBCR7_EL1 = A64_SYSREG_ENC(2, 0, 5, 7, 0),
|
||||
DBGBCR8_EL1 = A64_SYSREG_ENC(2, 0, 5, 8, 0),
|
||||
DBGBCR9_EL1 = A64_SYSREG_ENC(2, 0, 5, 9, 0),
|
||||
DBGBCR10_EL1 = A64_SYSREG_ENC(2, 0, 5, 10, 0),
|
||||
DBGBCR11_EL1 = A64_SYSREG_ENC(2, 0, 5, 11, 0),
|
||||
DBGBCR12_EL1 = A64_SYSREG_ENC(2, 0, 5, 12, 0),
|
||||
DBGBCR13_EL1 = A64_SYSREG_ENC(2, 0, 5, 13, 0),
|
||||
DBGBCR14_EL1 = A64_SYSREG_ENC(2, 0, 5, 14, 0),
|
||||
DBGBCR15_EL1 = A64_SYSREG_ENC(2, 0, 5, 15, 0),
|
||||
|
||||
DBGWVR0_EL1 = A64_SYSREG_ENC(2, 0, 6, 0, 0),
|
||||
DBGWVR1_EL1 = A64_SYSREG_ENC(2, 0, 6, 1, 0),
|
||||
DBGWVR2_EL1 = A64_SYSREG_ENC(2, 0, 6, 2, 0),
|
||||
DBGWVR3_EL1 = A64_SYSREG_ENC(2, 0, 6, 3, 0),
|
||||
DBGWVR4_EL1 = A64_SYSREG_ENC(2, 0, 6, 4, 0),
|
||||
DBGWVR5_EL1 = A64_SYSREG_ENC(2, 0, 6, 5, 0),
|
||||
DBGWVR6_EL1 = A64_SYSREG_ENC(2, 0, 6, 6, 0),
|
||||
DBGWVR7_EL1 = A64_SYSREG_ENC(2, 0, 6, 7, 0),
|
||||
DBGWVR8_EL1 = A64_SYSREG_ENC(2, 0, 6, 8, 0),
|
||||
DBGWVR9_EL1 = A64_SYSREG_ENC(2, 0, 6, 9, 0),
|
||||
DBGWVR10_EL1 = A64_SYSREG_ENC(2, 0, 6, 10, 0),
|
||||
DBGWVR11_EL1 = A64_SYSREG_ENC(2, 0, 6, 11, 0),
|
||||
DBGWVR12_EL1 = A64_SYSREG_ENC(2, 0, 6, 12, 0),
|
||||
DBGWVR13_EL1 = A64_SYSREG_ENC(2, 0, 6, 13, 0),
|
||||
DBGWVR14_EL1 = A64_SYSREG_ENC(2, 0, 6, 14, 0),
|
||||
DBGWVR15_EL1 = A64_SYSREG_ENC(2, 0, 6, 15, 0),
|
||||
|
||||
DBGWCR0_EL1 = A64_SYSREG_ENC(2, 0, 7, 0, 0),
|
||||
DBGWCR1_EL1 = A64_SYSREG_ENC(2, 0, 7, 1, 0),
|
||||
DBGWCR2_EL1 = A64_SYSREG_ENC(2, 0, 7, 2, 0),
|
||||
DBGWCR3_EL1 = A64_SYSREG_ENC(2, 0, 7, 3, 0),
|
||||
DBGWCR4_EL1 = A64_SYSREG_ENC(2, 0, 7, 4, 0),
|
||||
DBGWCR5_EL1 = A64_SYSREG_ENC(2, 0, 7, 5, 0),
|
||||
DBGWCR6_EL1 = A64_SYSREG_ENC(2, 0, 7, 6, 0),
|
||||
DBGWCR7_EL1 = A64_SYSREG_ENC(2, 0, 7, 7, 0),
|
||||
DBGWCR8_EL1 = A64_SYSREG_ENC(2, 0, 7, 8, 0),
|
||||
DBGWCR9_EL1 = A64_SYSREG_ENC(2, 0, 7, 9, 0),
|
||||
DBGWCR10_EL1 = A64_SYSREG_ENC(2, 0, 7, 10, 0),
|
||||
DBGWCR11_EL1 = A64_SYSREG_ENC(2, 0, 7, 11, 0),
|
||||
DBGWCR12_EL1 = A64_SYSREG_ENC(2, 0, 7, 12, 0),
|
||||
DBGWCR13_EL1 = A64_SYSREG_ENC(2, 0, 7, 13, 0),
|
||||
DBGWCR14_EL1 = A64_SYSREG_ENC(2, 0, 7, 14, 0),
|
||||
DBGWCR15_EL1 = A64_SYSREG_ENC(2, 0, 7, 15, 0),
|
||||
|
||||
MDRAR_EL1 = A64_SYSREG_ENC(2, 1, 0, 0, 0),
|
||||
OSLAR_EL1 = A64_SYSREG_ENC(2, 1, 4, 0, 0),
|
||||
OSLSR_EL1 = A64_SYSREG_ENC(2, 1, 4, 1, 0),
|
||||
OSDLR_EL1 = A64_SYSREG_ENC(2, 1, 4, 3, 0),
|
||||
DBGPRCR_EL1 = A64_SYSREG_ENC(2, 1, 4, 4, 0),
|
||||
|
||||
DBGCLAIMSET_EL1 = A64_SYSREG_ENC(2, 7, 6, 8, 0),
|
||||
DBGCLAIMCLR_EL1 = A64_SYSREG_ENC(2, 7, 6, 9, 0),
|
||||
DBGAUTHSTATUS_EL1 = A64_SYSREG_ENC(2, 7, 6, 14, 0),
|
||||
|
||||
DBGDEVID2 = A64_SYSREG_ENC(2, 7, 7, 0, 0),
|
||||
DBGDEVID1 = A64_SYSREG_ENC(2, 7, 7, 1, 0),
|
||||
DBGDEVID0 = A64_SYSREG_ENC(2, 7, 7, 2, 0),
|
||||
|
||||
// The following registers are defined to allow access from AArch64 to
|
||||
// registers which are only used in the AArch32 architecture.
|
||||
DACR32_EL2 = 0xe180,
|
||||
IFSR32_EL2 = 0xe281,
|
||||
TEEHBR32_EL1 = 0x9080,
|
||||
SDER32_EL3 = 0xf089,
|
||||
FPEXC32_EL2 = 0xe298,
|
||||
|
||||
// Cyclone specific system registers
|
||||
CPM_IOACC_CTL_EL3 = 0xff90,
|
||||
|
||||
// Architectural system registers
|
||||
ID_PFR0_EL1 = 0xc008,
|
||||
ID_PFR1_EL1 = 0xc009,
|
||||
ID_DFR0_EL1 = 0xc00a,
|
||||
ID_AFR0_EL1 = 0xc00b,
|
||||
ID_ISAR0_EL1 = 0xc010,
|
||||
ID_ISAR1_EL1 = 0xc011,
|
||||
ID_ISAR2_EL1 = 0xc012,
|
||||
ID_ISAR3_EL1 = 0xc013,
|
||||
ID_ISAR4_EL1 = 0xc014,
|
||||
ID_ISAR5_EL1 = 0xc015,
|
||||
AFSR1_EL1 = 0xc289, // note same as old AIFSR_EL1
|
||||
AFSR0_EL1 = 0xc288, // note same as old ADFSR_EL1
|
||||
REVIDR_EL1 = 0xc006 // note same as old ECOIDR_EL1
|
||||
|
||||
};
|
||||
#undef A64_SYSREG_ENC
|
||||
|
||||
static inline const char *getSystemRegisterName(SystemRegister Reg) {
|
||||
switch(Reg) {
|
||||
default: return NULL; // Caller is responsible for handling invalid value.
|
||||
case SPSR_EL1: return "SPSR_EL1";
|
||||
case ELR_EL1: return "ELR_EL1";
|
||||
case SP_EL0: return "SP_EL0";
|
||||
case SPSel: return "SPSel";
|
||||
case DAIF: return "DAIF";
|
||||
case CurrentEL: return "CurrentEL";
|
||||
case NZCV: return "NZCV";
|
||||
case FPCR: return "FPCR";
|
||||
case FPSR: return "FPSR";
|
||||
case DSPSR: return "DSPSR";
|
||||
case DLR: return "DLR";
|
||||
case SPSR_EL2: return "SPSR_EL2";
|
||||
case ELR_EL2: return "ELR_EL2";
|
||||
case SP_EL1: return "SP_EL1";
|
||||
case SPSR_irq: return "SPSR_irq";
|
||||
case SPSR_abt: return "SPSR_abt";
|
||||
case SPSR_und: return "SPSR_und";
|
||||
case SPSR_fiq: return "SPSR_fiq";
|
||||
case SPSR_EL3: return "SPSR_EL3";
|
||||
case ELR_EL3: return "ELR_EL3";
|
||||
case SP_EL2: return "SP_EL2";
|
||||
case MIDR_EL1: return "MIDR_EL1";
|
||||
case CTR_EL0: return "CTR_EL0";
|
||||
case MPIDR_EL1: return "MPIDR_EL1";
|
||||
case DCZID_EL0: return "DCZID_EL0";
|
||||
case MVFR0_EL1: return "MVFR0_EL1";
|
||||
case MVFR1_EL1: return "MVFR1_EL1";
|
||||
case ID_AA64PFR0_EL1: return "ID_AA64PFR0_EL1";
|
||||
case ID_AA64PFR1_EL1: return "ID_AA64PFR1_EL1";
|
||||
case ID_AA64DFR0_EL1: return "ID_AA64DFR0_EL1";
|
||||
case ID_AA64DFR1_EL1: return "ID_AA64DFR1_EL1";
|
||||
case ID_AA64ISAR0_EL1: return "ID_AA64ISAR0_EL1";
|
||||
case ID_AA64ISAR1_EL1: return "ID_AA64ISAR1_EL1";
|
||||
case ID_AA64MMFR0_EL1: return "ID_AA64MMFR0_EL1";
|
||||
case ID_AA64MMFR1_EL1: return "ID_AA64MMFR1_EL1";
|
||||
case CCSIDR_EL1: return "CCSIDR_EL1";
|
||||
case CLIDR_EL1: return "CLIDR_EL1";
|
||||
case AIDR_EL1: return "AIDR_EL1";
|
||||
case CSSELR_EL1: return "CSSELR_EL1";
|
||||
case VPIDR_EL2: return "VPIDR_EL2";
|
||||
case VMPIDR_EL2: return "VMPIDR_EL2";
|
||||
case SCTLR_EL1: return "SCTLR_EL1";
|
||||
case SCTLR_EL2: return "SCTLR_EL2";
|
||||
case SCTLR_EL3: return "SCTLR_EL3";
|
||||
case ACTLR_EL1: return "ACTLR_EL1";
|
||||
case ACTLR_EL2: return "ACTLR_EL2";
|
||||
case ACTLR_EL3: return "ACTLR_EL3";
|
||||
case CPACR_EL1: return "CPACR_EL1";
|
||||
case CPTR_EL2: return "CPTR_EL2";
|
||||
case CPTR_EL3: return "CPTR_EL3";
|
||||
case SCR_EL3: return "SCR_EL3";
|
||||
case HCR_EL2: return "HCR_EL2";
|
||||
case MDCR_EL2: return "MDCR_EL2";
|
||||
case MDCR_EL3: return "MDCR_EL3";
|
||||
case HSTR_EL2: return "HSTR_EL2";
|
||||
case HACR_EL2: return "HACR_EL2";
|
||||
case TTBR0_EL1: return "TTBR0_EL1";
|
||||
case TTBR1_EL1: return "TTBR1_EL1";
|
||||
case TTBR0_EL2: return "TTBR0_EL2";
|
||||
case TTBR0_EL3: return "TTBR0_EL3";
|
||||
case VTTBR_EL2: return "VTTBR_EL2";
|
||||
case TCR_EL1: return "TCR_EL1";
|
||||
case TCR_EL2: return "TCR_EL2";
|
||||
case TCR_EL3: return "TCR_EL3";
|
||||
case VTCR_EL2: return "VTCR_EL2";
|
||||
case ADFSR_EL2: return "ADFSR_EL2";
|
||||
case AIFSR_EL2: return "AIFSR_EL2";
|
||||
case ADFSR_EL3: return "ADFSR_EL3";
|
||||
case AIFSR_EL3: return "AIFSR_EL3";
|
||||
case ESR_EL1: return "ESR_EL1";
|
||||
case ESR_EL2: return "ESR_EL2";
|
||||
case ESR_EL3: return "ESR_EL3";
|
||||
case FAR_EL1: return "FAR_EL1";
|
||||
case FAR_EL2: return "FAR_EL2";
|
||||
case FAR_EL3: return "FAR_EL3";
|
||||
case HPFAR_EL2: return "HPFAR_EL2";
|
||||
case PAR_EL1: return "PAR_EL1";
|
||||
case MAIR_EL1: return "MAIR_EL1";
|
||||
case MAIR_EL2: return "MAIR_EL2";
|
||||
case MAIR_EL3: return "MAIR_EL3";
|
||||
case AMAIR_EL1: return "AMAIR_EL1";
|
||||
case AMAIR_EL2: return "AMAIR_EL2";
|
||||
case AMAIR_EL3: return "AMAIR_EL3";
|
||||
case VBAR_EL1: return "VBAR_EL1";
|
||||
case VBAR_EL2: return "VBAR_EL2";
|
||||
case VBAR_EL3: return "VBAR_EL3";
|
||||
case RVBAR_EL1: return "RVBAR_EL1";
|
||||
case RVBAR_EL2: return "RVBAR_EL2";
|
||||
case RVBAR_EL3: return "RVBAR_EL3";
|
||||
case ISR_EL1: return "ISR_EL1";
|
||||
case CONTEXTIDR_EL1: return "CONTEXTIDR_EL1";
|
||||
case TPIDR_EL0: return "TPIDR_EL0";
|
||||
case TPIDRRO_EL0: return "TPIDRRO_EL0";
|
||||
case TPIDR_EL1: return "TPIDR_EL1";
|
||||
case TPIDR_EL2: return "TPIDR_EL2";
|
||||
case TPIDR_EL3: return "TPIDR_EL3";
|
||||
case TEECR32_EL1: return "TEECR32_EL1";
|
||||
case CNTFRQ_EL0: return "CNTFRQ_EL0";
|
||||
case CNTPCT_EL0: return "CNTPCT_EL0";
|
||||
case CNTVCT_EL0: return "CNTVCT_EL0";
|
||||
case CNTVOFF_EL2: return "CNTVOFF_EL2";
|
||||
case CNTKCTL_EL1: return "CNTKCTL_EL1";
|
||||
case CNTHCTL_EL2: return "CNTHCTL_EL2";
|
||||
case CNTP_TVAL_EL0: return "CNTP_TVAL_EL0";
|
||||
case CNTP_CTL_EL0: return "CNTP_CTL_EL0";
|
||||
case CNTP_CVAL_EL0: return "CNTP_CVAL_EL0";
|
||||
case CNTV_TVAL_EL0: return "CNTV_TVAL_EL0";
|
||||
case CNTV_CTL_EL0: return "CNTV_CTL_EL0";
|
||||
case CNTV_CVAL_EL0: return "CNTV_CVAL_EL0";
|
||||
case CNTHP_TVAL_EL2: return "CNTHP_TVAL_EL2";
|
||||
case CNTHP_CTL_EL2: return "CNTHP_CTL_EL2";
|
||||
case CNTHP_CVAL_EL2: return "CNTHP_CVAL_EL2";
|
||||
case CNTPS_TVAL_EL1: return "CNTPS_TVAL_EL1";
|
||||
case CNTPS_CTL_EL1: return "CNTPS_CTL_EL1";
|
||||
case CNTPS_CVAL_EL1: return "CNTPS_CVAL_EL1";
|
||||
case DACR32_EL2: return "DACR32_EL2";
|
||||
case IFSR32_EL2: return "IFSR32_EL2";
|
||||
case TEEHBR32_EL1: return "TEEHBR32_EL1";
|
||||
case SDER32_EL3: return "SDER32_EL3";
|
||||
case FPEXC32_EL2: return "FPEXC32_EL2";
|
||||
case PMEVCNTR0_EL0: return "PMEVCNTR0_EL0";
|
||||
case PMEVCNTR1_EL0: return "PMEVCNTR1_EL0";
|
||||
case PMEVCNTR2_EL0: return "PMEVCNTR2_EL0";
|
||||
case PMEVCNTR3_EL0: return "PMEVCNTR3_EL0";
|
||||
case PMEVCNTR4_EL0: return "PMEVCNTR4_EL0";
|
||||
case PMEVCNTR5_EL0: return "PMEVCNTR5_EL0";
|
||||
case PMEVCNTR6_EL0: return "PMEVCNTR6_EL0";
|
||||
case PMEVCNTR7_EL0: return "PMEVCNTR7_EL0";
|
||||
case PMEVCNTR8_EL0: return "PMEVCNTR8_EL0";
|
||||
case PMEVCNTR9_EL0: return "PMEVCNTR9_EL0";
|
||||
case PMEVCNTR10_EL0: return "PMEVCNTR10_EL0";
|
||||
case PMEVCNTR11_EL0: return "PMEVCNTR11_EL0";
|
||||
case PMEVCNTR12_EL0: return "PMEVCNTR12_EL0";
|
||||
case PMEVCNTR13_EL0: return "PMEVCNTR13_EL0";
|
||||
case PMEVCNTR14_EL0: return "PMEVCNTR14_EL0";
|
||||
case PMEVCNTR15_EL0: return "PMEVCNTR15_EL0";
|
||||
case PMEVCNTR16_EL0: return "PMEVCNTR16_EL0";
|
||||
case PMEVCNTR17_EL0: return "PMEVCNTR17_EL0";
|
||||
case PMEVCNTR18_EL0: return "PMEVCNTR18_EL0";
|
||||
case PMEVCNTR19_EL0: return "PMEVCNTR19_EL0";
|
||||
case PMEVCNTR20_EL0: return "PMEVCNTR20_EL0";
|
||||
case PMEVCNTR21_EL0: return "PMEVCNTR21_EL0";
|
||||
case PMEVCNTR22_EL0: return "PMEVCNTR22_EL0";
|
||||
case PMEVCNTR23_EL0: return "PMEVCNTR23_EL0";
|
||||
case PMEVCNTR24_EL0: return "PMEVCNTR24_EL0";
|
||||
case PMEVCNTR25_EL0: return "PMEVCNTR25_EL0";
|
||||
case PMEVCNTR26_EL0: return "PMEVCNTR26_EL0";
|
||||
case PMEVCNTR27_EL0: return "PMEVCNTR27_EL0";
|
||||
case PMEVCNTR28_EL0: return "PMEVCNTR28_EL0";
|
||||
case PMEVCNTR29_EL0: return "PMEVCNTR29_EL0";
|
||||
case PMEVCNTR30_EL0: return "PMEVCNTR30_EL0";
|
||||
case PMEVTYPER0_EL0: return "PMEVTYPER0_EL0";
|
||||
case PMEVTYPER1_EL0: return "PMEVTYPER1_EL0";
|
||||
case PMEVTYPER2_EL0: return "PMEVTYPER2_EL0";
|
||||
case PMEVTYPER3_EL0: return "PMEVTYPER3_EL0";
|
||||
case PMEVTYPER4_EL0: return "PMEVTYPER4_EL0";
|
||||
case PMEVTYPER5_EL0: return "PMEVTYPER5_EL0";
|
||||
case PMEVTYPER6_EL0: return "PMEVTYPER6_EL0";
|
||||
case PMEVTYPER7_EL0: return "PMEVTYPER7_EL0";
|
||||
case PMEVTYPER8_EL0: return "PMEVTYPER8_EL0";
|
||||
case PMEVTYPER9_EL0: return "PMEVTYPER9_EL0";
|
||||
case PMEVTYPER10_EL0: return "PMEVTYPER10_EL0";
|
||||
case PMEVTYPER11_EL0: return "PMEVTYPER11_EL0";
|
||||
case PMEVTYPER12_EL0: return "PMEVTYPER12_EL0";
|
||||
case PMEVTYPER13_EL0: return "PMEVTYPER13_EL0";
|
||||
case PMEVTYPER14_EL0: return "PMEVTYPER14_EL0";
|
||||
case PMEVTYPER15_EL0: return "PMEVTYPER15_EL0";
|
||||
case PMEVTYPER16_EL0: return "PMEVTYPER16_EL0";
|
||||
case PMEVTYPER17_EL0: return "PMEVTYPER17_EL0";
|
||||
case PMEVTYPER18_EL0: return "PMEVTYPER18_EL0";
|
||||
case PMEVTYPER19_EL0: return "PMEVTYPER19_EL0";
|
||||
case PMEVTYPER20_EL0: return "PMEVTYPER20_EL0";
|
||||
case PMEVTYPER21_EL0: return "PMEVTYPER21_EL0";
|
||||
case PMEVTYPER22_EL0: return "PMEVTYPER22_EL0";
|
||||
case PMEVTYPER23_EL0: return "PMEVTYPER23_EL0";
|
||||
case PMEVTYPER24_EL0: return "PMEVTYPER24_EL0";
|
||||
case PMEVTYPER25_EL0: return "PMEVTYPER25_EL0";
|
||||
case PMEVTYPER26_EL0: return "PMEVTYPER26_EL0";
|
||||
case PMEVTYPER27_EL0: return "PMEVTYPER27_EL0";
|
||||
case PMEVTYPER28_EL0: return "PMEVTYPER28_EL0";
|
||||
case PMEVTYPER29_EL0: return "PMEVTYPER29_EL0";
|
||||
case PMEVTYPER30_EL0: return "PMEVTYPER30_EL0";
|
||||
case PMCCFILTR_EL0: return "PMCCFILTR_EL0";
|
||||
case RMR_EL3: return "RMR_EL3";
|
||||
case RMR_EL2: return "RMR_EL2";
|
||||
case RMR_EL1: return "RMR_EL1";
|
||||
case CPM_IOACC_CTL_EL3: return "CPM_IOACC_CTL_EL3";
|
||||
case MDCCSR_EL0: return "MDCCSR_EL0";
|
||||
case MDCCINT_EL1: return "MDCCINT_EL1";
|
||||
case DBGDTR_EL0: return "DBGDTR_EL0";
|
||||
case DBGDTRRX_EL0: return "DBGDTRRX_EL0";
|
||||
case DBGVCR32_EL2: return "DBGVCR32_EL2";
|
||||
case OSDTRRX_EL1: return "OSDTRRX_EL1";
|
||||
case MDSCR_EL1: return "MDSCR_EL1";
|
||||
case OSDTRTX_EL1: return "OSDTRTX_EL1";
|
||||
case OSECCR_EL11: return "OSECCR_EL11";
|
||||
  case DBGBVR0_EL1: return "DBGBVR0_EL1";
  case DBGBVR1_EL1: return "DBGBVR1_EL1";
  case DBGBVR2_EL1: return "DBGBVR2_EL1";
  case DBGBVR3_EL1: return "DBGBVR3_EL1";
  case DBGBVR4_EL1: return "DBGBVR4_EL1";
  case DBGBVR5_EL1: return "DBGBVR5_EL1";
  case DBGBVR6_EL1: return "DBGBVR6_EL1";
  case DBGBVR7_EL1: return "DBGBVR7_EL1";
  case DBGBVR8_EL1: return "DBGBVR8_EL1";
  case DBGBVR9_EL1: return "DBGBVR9_EL1";
  case DBGBVR10_EL1: return "DBGBVR10_EL1";
  case DBGBVR11_EL1: return "DBGBVR11_EL1";
  case DBGBVR12_EL1: return "DBGBVR12_EL1";
  case DBGBVR13_EL1: return "DBGBVR13_EL1";
  case DBGBVR14_EL1: return "DBGBVR14_EL1";
  case DBGBVR15_EL1: return "DBGBVR15_EL1";
  case DBGBCR0_EL1: return "DBGBCR0_EL1";
  case DBGBCR1_EL1: return "DBGBCR1_EL1";
  case DBGBCR2_EL1: return "DBGBCR2_EL1";
  case DBGBCR3_EL1: return "DBGBCR3_EL1";
  case DBGBCR4_EL1: return "DBGBCR4_EL1";
  case DBGBCR5_EL1: return "DBGBCR5_EL1";
  case DBGBCR6_EL1: return "DBGBCR6_EL1";
  case DBGBCR7_EL1: return "DBGBCR7_EL1";
  case DBGBCR8_EL1: return "DBGBCR8_EL1";
  case DBGBCR9_EL1: return "DBGBCR9_EL1";
  case DBGBCR10_EL1: return "DBGBCR10_EL1";
  case DBGBCR11_EL1: return "DBGBCR11_EL1";
  case DBGBCR12_EL1: return "DBGBCR12_EL1";
  case DBGBCR13_EL1: return "DBGBCR13_EL1";
  case DBGBCR14_EL1: return "DBGBCR14_EL1";
  case DBGBCR15_EL1: return "DBGBCR15_EL1";
  case DBGWVR0_EL1: return "DBGWVR0_EL1";
  case DBGWVR1_EL1: return "DBGWVR1_EL1";
  case DBGWVR2_EL1: return "DBGWVR2_EL1";
  case DBGWVR3_EL1: return "DBGWVR3_EL1";
  case DBGWVR4_EL1: return "DBGWVR4_EL1";
  case DBGWVR5_EL1: return "DBGWVR5_EL1";
  case DBGWVR6_EL1: return "DBGWVR6_EL1";
  case DBGWVR7_EL1: return "DBGWVR7_EL1";
  case DBGWVR8_EL1: return "DBGWVR8_EL1";
  case DBGWVR9_EL1: return "DBGWVR9_EL1";
  case DBGWVR10_EL1: return "DBGWVR10_EL1";
  case DBGWVR11_EL1: return "DBGWVR11_EL1";
  case DBGWVR12_EL1: return "DBGWVR12_EL1";
  case DBGWVR13_EL1: return "DBGWVR13_EL1";
  case DBGWVR14_EL1: return "DBGWVR14_EL1";
  case DBGWVR15_EL1: return "DBGWVR15_EL1";
  case DBGWCR0_EL1: return "DBGWCR0_EL1";
  case DBGWCR1_EL1: return "DBGWCR1_EL1";
  case DBGWCR2_EL1: return "DBGWCR2_EL1";
  case DBGWCR3_EL1: return "DBGWCR3_EL1";
  case DBGWCR4_EL1: return "DBGWCR4_EL1";
  case DBGWCR5_EL1: return "DBGWCR5_EL1";
  case DBGWCR6_EL1: return "DBGWCR6_EL1";
  case DBGWCR7_EL1: return "DBGWCR7_EL1";
  case DBGWCR8_EL1: return "DBGWCR8_EL1";
  case DBGWCR9_EL1: return "DBGWCR9_EL1";
  case DBGWCR10_EL1: return "DBGWCR10_EL1";
  case DBGWCR11_EL1: return "DBGWCR11_EL1";
  case DBGWCR12_EL1: return "DBGWCR12_EL1";
  case DBGWCR13_EL1: return "DBGWCR13_EL1";
  case DBGWCR14_EL1: return "DBGWCR14_EL1";
  case DBGWCR15_EL1: return "DBGWCR15_EL1";
  case MDRAR_EL1: return "MDRAR_EL1";
  case OSLAR_EL1: return "OSLAR_EL1";
  case OSLSR_EL1: return "OSLSR_EL1";
  case OSDLR_EL1: return "OSDLR_EL1";
  case DBGPRCR_EL1: return "DBGPRCR_EL1";
  case DBGCLAIMSET_EL1: return "DBGCLAIMSET_EL1";
  case DBGCLAIMCLR_EL1: return "DBGCLAIMCLR_EL1";
  case DBGAUTHSTATUS_EL1: return "DBGAUTHSTATUS_EL1";
  case DBGDEVID2: return "DBGDEVID2";
  case DBGDEVID1: return "DBGDEVID1";
  case DBGDEVID0: return "DBGDEVID0";
  case ID_PFR0_EL1: return "ID_PFR0_EL1";
  case ID_PFR1_EL1: return "ID_PFR1_EL1";
  case ID_DFR0_EL1: return "ID_DFR0_EL1";
  case ID_AFR0_EL1: return "ID_AFR0_EL1";
  case ID_ISAR0_EL1: return "ID_ISAR0_EL1";
  case ID_ISAR1_EL1: return "ID_ISAR1_EL1";
  case ID_ISAR2_EL1: return "ID_ISAR2_EL1";
  case ID_ISAR3_EL1: return "ID_ISAR3_EL1";
  case ID_ISAR4_EL1: return "ID_ISAR4_EL1";
  case ID_ISAR5_EL1: return "ID_ISAR5_EL1";
  case AFSR1_EL1: return "AFSR1_EL1";
  case AFSR0_EL1: return "AFSR0_EL1";
  case REVIDR_EL1: return "REVIDR_EL1";
  }
}

enum CPSRField {
  InvalidCPSRField = 0xff,
  cpsr_SPSel = 0x5,
  cpsr_DAIFSet = 0x1e,
  cpsr_DAIFClr = 0x1f
};

static inline const char *getCPSRFieldName(CPSRField Val) {
  switch(Val) {
  default: assert(0 && "Invalid CPSR field value!");
  case cpsr_SPSel: return "SPSel";
  case cpsr_DAIFSet: return "DAIFSet";
  case cpsr_DAIFClr: return "DAIFClr";
  }
}

} // end namespace ARM64SYS

namespace ARM64II {
/// Target Operand Flag enum.
enum TOF {
  //===------------------------------------------------------------------===//
  // ARM64 Specific MachineOperand flags.

  MO_NO_FLAG,

  MO_FRAGMENT = 0x7,

  /// MO_PAGE - A symbol operand with this flag represents the pc-relative
  /// offset of the 4K page containing the symbol. This is used with the
  /// ADRP instruction.
  MO_PAGE = 1,

  /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
  /// that symbol within a 4K page. This offset is added to the page address
  /// to produce the complete address.
  MO_PAGEOFF = 2,

  /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
  /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
  MO_G3 = 3,

  /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
  /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
  MO_G2 = 4,

  /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
  /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
  MO_G1 = 5,

  /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
  /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
  MO_G0 = 6,

  /// MO_GOT - This flag indicates that a symbol operand represents the
  /// address of the GOT entry for the symbol, rather than the address of
  /// the symbol itself.
  MO_GOT = 8,

  /// MO_NC - Indicates whether the linker is expected to check the symbol
  /// reference for overflow. For example in an ADRP/ADD pair of relocations
  /// the ADRP usually does check, but not the ADD.
  MO_NC = 0x10,

  /// MO_TLS - Indicates that the operand being accessed is some kind of
  /// thread-local symbol. On Darwin, only one type of thread-local access
  /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
  /// referee will affect interpretation.
  MO_TLS = 0x20
};
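
// Illustrative example (editorial sketch, not part of the enum above): a
// typical global-address materialization pairs the fragment flags, e.g.
//   adrp x0, global             ; symbol operand flagged MO_PAGE
//   add  x0, x0, :lo12:global   ; symbol operand flagged MO_PAGEOFF | MO_NC
// so the ADRP relocation is range-checked while the ADD one is not.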
} // end namespace ARM64II

} // end namespace llvm

#endif
237
lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp
Normal file
@ -0,0 +1,237 @@
|
||||
//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file handles ELF-specific object emission, converting LLVM's internal
|
||||
// fixups into the appropriate relocations.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "MCTargetDesc/ARM64FixupKinds.h"
|
||||
#include "MCTargetDesc/ARM64MCExpr.h"
|
||||
#include "MCTargetDesc/ARM64MCTargetDesc.h"
|
||||
#include "llvm/MC/MCELFObjectWriter.h"
|
||||
#include "llvm/MC/MCValue.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
class ARM64ELFObjectWriter : public MCELFObjectTargetWriter {
|
||||
public:
|
||||
ARM64ELFObjectWriter(uint8_t OSABI);
|
||||
|
||||
virtual ~ARM64ELFObjectWriter();
|
||||
|
||||
protected:
|
||||
unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
|
||||
bool IsPCRel) const override;
|
||||
|
||||
private:
|
||||
};
|
||||
}
|
||||
|
||||
ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI)
|
||||
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
|
||||
/*HasRelocationAddend*/ true) {}
|
||||
|
||||
ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {}
|
||||
|
||||
unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target,
|
||||
const MCFixup &Fixup,
|
||||
bool IsPCRel) const {
|
||||
ARM64MCExpr::VariantKind RefKind =
|
||||
static_cast<ARM64MCExpr::VariantKind>(Target.getRefKind());
|
||||
ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind);
|
||||
bool IsNC = ARM64MCExpr::isNotChecked(RefKind);
|
||||
|
||||
assert((!Target.getSymA() ||
|
||||
Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
|
||||
"Should only be expression-level modifiers here");
|
||||
|
||||
assert((!Target.getSymB() ||
|
||||
Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
|
||||
"Should only be expression-level modifiers here");
|
||||
|
||||
if (IsPCRel) {
|
||||
switch ((unsigned)Fixup.getKind()) {
|
||||
case FK_Data_2:
|
||||
return ELF::R_AARCH64_PREL16;
|
||||
case FK_Data_4:
|
||||
return ELF::R_AARCH64_PREL32;
|
||||
case FK_Data_8:
|
||||
return ELF::R_AARCH64_PREL64;
|
||||
case ARM64::fixup_arm64_pcrel_adr_imm21:
|
||||
llvm_unreachable("No ELF relocations supported for ADR at the moment");
|
||||
case ARM64::fixup_arm64_pcrel_adrp_imm21:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC)
|
||||
return ELF::R_AARCH64_ADR_PREL_PG_HI21;
|
||||
if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC)
|
||||
return ELF::R_AARCH64_ADR_GOT_PAGE;
|
||||
if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
|
||||
if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC)
|
||||
return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
|
||||
llvm_unreachable("invalid symbol kind for ADRP relocation");
|
||||
case ARM64::fixup_arm64_pcrel_branch26:
|
||||
return ELF::R_AARCH64_JUMP26;
|
||||
case ARM64::fixup_arm64_pcrel_call26:
|
||||
return ELF::R_AARCH64_CALL26;
|
||||
case ARM64::fixup_arm64_pcrel_imm19:
|
||||
return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
|
||||
default:
|
||||
llvm_unreachable("Unsupported pc-relative fixup kind");
|
||||
}
|
||||
} else {
|
||||
switch ((unsigned)Fixup.getKind()) {
|
||||
case FK_Data_2:
|
||||
return ELF::R_AARCH64_ABS16;
|
||||
case FK_Data_4:
|
||||
return ELF::R_AARCH64_ABS32;
|
||||
case FK_Data_8:
|
||||
return ELF::R_AARCH64_ABS64;
|
||||
case ARM64::fixup_arm64_add_imm12:
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
|
||||
return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_ADD_ABS_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for add (uimm12) instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale1:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for 8-bit load/store instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale2:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for 16-bit load/store instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale4:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for 32-bit load/store instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale8:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_GOT && IsNC)
|
||||
return ELF::R_AARCH64_LD64_GOT_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
|
||||
if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC)
|
||||
return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
|
||||
if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
|
||||
return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for 64-bit load/store instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_ldst_imm12_scale16:
|
||||
if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
|
||||
return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
|
||||
|
||||
report_fatal_error("invalid fixup for 128-bit load/store instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_movw:
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G3)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G3;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G2)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G2;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G2_NC)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G2_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G1)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G1;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G1_NC)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G1_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G0)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G0;
|
||||
if (RefKind == ARM64MCExpr::VK_ABS_G0_NC)
|
||||
return ELF::R_AARCH64_MOVW_UABS_G0_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_DTPREL_G2)
|
||||
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
|
||||
if (RefKind == ARM64MCExpr::VK_DTPREL_G1)
|
||||
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
|
||||
if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC)
|
||||
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_DTPREL_G0)
|
||||
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
|
||||
if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC)
|
||||
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_TPREL_G2)
|
||||
return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
|
||||
if (RefKind == ARM64MCExpr::VK_TPREL_G1)
|
||||
return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
|
||||
if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC)
|
||||
return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_TPREL_G0)
|
||||
return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
|
||||
if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC)
|
||||
return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
|
||||
if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1)
|
||||
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
|
||||
if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC)
|
||||
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
|
||||
report_fatal_error("invalid fixup for movz/movk instruction");
|
||||
return 0;
|
||||
case ARM64::fixup_arm64_tlsdesc_call:
|
||||
return ELF::R_AARCH64_TLSDESC_CALL;
|
||||
default:
|
||||
llvm_unreachable("Unknown ELF relocation type");
|
||||
}
|
||||
}
|
||||
|
||||
llvm_unreachable("Unimplemented fixup -> relocation");
|
||||
}
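
// Worked example (illustrative, not from the original file): for
// "add x0, x0, :lo12:var" the assembler records ARM64::fixup_arm64_add_imm12
// with a symbol location of VK_ABS and the not-checked bit set, so the
// non-PC-relative switch above selects ELF::R_AARCH64_ADD_ABS_LO12_NC.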
|
||||
|
||||
MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS,
|
||||
uint8_t OSABI) {
|
||||
MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI);
|
||||
return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
|
||||
}
158
lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp
Normal file
@ -0,0 +1,158 @@
|
||||
//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file assembles .s files and emits AArch64 ELF .o object files. Different
|
||||
// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
|
||||
// regions of data and code.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/MC/MCELFStreamer.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/MC/MCAsmBackend.h"
|
||||
#include "llvm/MC/MCAssembler.h"
|
||||
#include "llvm/MC/MCCodeEmitter.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCELF.h"
|
||||
#include "llvm/MC/MCELFStreamer.h"
|
||||
#include "llvm/MC/MCELFSymbolFlags.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCObjectStreamer.h"
|
||||
#include "llvm/MC/MCSection.h"
|
||||
#include "llvm/MC/MCSectionELF.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/MC/MCValue.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ELF.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
|
||||
/// the appropriate points in the object files. These symbols are defined in the
|
||||
/// AArch64 ELF ABI:
|
||||
/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
|
||||
///
|
||||
/// In brief: $x or $d should be emitted at the start of each contiguous region
|
||||
/// of A64 code or data in a section. In practice, this emission does not rely
|
||||
/// on explicit assembler directives but on inherent properties of the
|
||||
/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
|
||||
/// instruction).
|
||||
///
|
||||
/// As a result this system is orthogonal to the DataRegion infrastructure used
|
||||
/// by MachO. Beware!
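///
/// For illustration (an added example, not in the ABI text), a fragment such
/// as
///     add x0, x0, x0    // "$x" is emitted before the first instruction
///     .word 0x1234      // "$d" is emitted before the first data directive
/// gets one mapping symbol at each code/data transition.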
|
||||
class ARM64ELFStreamer : public MCELFStreamer {
|
||||
public:
|
||||
ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
|
||||
MCCodeEmitter *Emitter)
|
||||
: MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
|
||||
LastEMS(EMS_None) {}
|
||||
|
||||
~ARM64ELFStreamer() {}
|
||||
|
||||
virtual void ChangeSection(const MCSection *Section,
|
||||
const MCExpr *Subsection) {
|
||||
// We have to keep track of the mapping symbol state of any sections we
|
||||
// use. Each one should start off as EMS_None, which is provided as the
|
||||
// default constructor by DenseMap::lookup.
|
||||
LastMappingSymbols[getPreviousSection().first] = LastEMS;
|
||||
LastEMS = LastMappingSymbols.lookup(Section);
|
||||
|
||||
MCELFStreamer::ChangeSection(Section, Subsection);
|
||||
}
|
||||
|
||||
/// This function is the one used to emit instruction data into the ELF
|
||||
/// streamer. We override it to add the appropriate mapping symbol if
|
||||
/// necessary.
|
||||
virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
|
||||
EmitA64MappingSymbol();
|
||||
MCELFStreamer::EmitInstruction(Inst, STI);
|
||||
}
|
||||
|
||||
/// This is one of the functions used to emit data into an ELF section, so the
|
||||
/// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
|
||||
/// if necessary.
|
||||
virtual void EmitBytes(StringRef Data) {
|
||||
EmitDataMappingSymbol();
|
||||
MCELFStreamer::EmitBytes(Data);
|
||||
}
|
||||
|
||||
/// This is one of the functions used to emit data into an ELF section, so the
|
||||
/// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
|
||||
/// if necessary.
|
||||
virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
|
||||
EmitDataMappingSymbol();
|
||||
MCELFStreamer::EmitValueImpl(Value, Size);
|
||||
}
|
||||
|
||||
private:
|
||||
enum ElfMappingSymbol {
|
||||
EMS_None,
|
||||
EMS_A64,
|
||||
EMS_Data
|
||||
};
|
||||
|
||||
void EmitDataMappingSymbol() {
|
||||
if (LastEMS == EMS_Data)
|
||||
return;
|
||||
EmitMappingSymbol("$d");
|
||||
LastEMS = EMS_Data;
|
||||
}
|
||||
|
||||
void EmitA64MappingSymbol() {
|
||||
if (LastEMS == EMS_A64)
|
||||
return;
|
||||
EmitMappingSymbol("$x");
|
||||
LastEMS = EMS_A64;
|
||||
}
|
||||
|
||||
void EmitMappingSymbol(StringRef Name) {
|
||||
MCSymbol *Start = getContext().CreateTempSymbol();
|
||||
EmitLabel(Start);
|
||||
|
||||
MCSymbol *Symbol = getContext().GetOrCreateSymbol(
|
||||
Name + "." + Twine(MappingSymbolCounter++));
|
||||
|
||||
MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
|
||||
MCELF::SetType(SD, ELF::STT_NOTYPE);
|
||||
MCELF::SetBinding(SD, ELF::STB_LOCAL);
|
||||
SD.setExternal(false);
|
||||
Symbol->setSection(*getCurrentSection().first);
|
||||
|
||||
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
|
||||
Symbol->setVariableValue(Value);
|
||||
}
|
||||
|
||||
int64_t MappingSymbolCounter;
|
||||
|
||||
DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
|
||||
ElfMappingSymbol LastEMS;
|
||||
|
||||
/// @}
|
||||
};
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
|
||||
raw_ostream &OS, MCCodeEmitter *Emitter,
|
||||
bool RelaxAll, bool NoExecStack) {
|
||||
ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter);
|
||||
if (RelaxAll)
|
||||
S->getAssembler().setRelaxAll(true);
|
||||
if (NoExecStack)
|
||||
S->getAssembler().setNoExecStack(true);
|
||||
return S;
|
||||
}
|
||||
}
26
lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h
Normal file
@ -0,0 +1,26 @@
//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements ELF streamer information for the ARM64 backend.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_AARCH64_ELF_STREAMER_H
#define LLVM_AARCH64_ELF_STREAMER_H

#include "llvm/MC/MCELFStreamer.h"

namespace llvm {

MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                      raw_ostream &OS, MCCodeEmitter *Emitter,
                                      bool RelaxAll, bool NoExecStack);
}

#endif // LLVM_AARCH64_ELF_STREAMER_H
72
lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h
Normal file
@ -0,0 +1,72 @@
//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ARM64FIXUPKINDS_H
#define LLVM_ARM64FIXUPKINDS_H

#include "llvm/MC/MCFixup.h"

namespace llvm {
namespace ARM64 {

enum Fixups {
  // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
  // an ADR instruction.
  fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind,

  // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
  // an ADRP instruction.
  fixup_arm64_pcrel_adrp_imm21,

  // fixup_arm64_add_imm12 - 12-bit fixup for add/sub instructions.
  // No alignment adjustment. All value bits are encoded.
  fixup_arm64_add_imm12,

  // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and
  // store instructions.
  fixup_arm64_ldst_imm12_scale1,
  fixup_arm64_ldst_imm12_scale2,
  fixup_arm64_ldst_imm12_scale4,
  fixup_arm64_ldst_imm12_scale8,
  fixup_arm64_ldst_imm12_scale16,

  // fixup_arm64_movw - 16-bit fixup for the immediate of a MOVZ or MOVK
  // instruction; the expression modifier selects the granule and relocation.
  fixup_arm64_movw,

  // fixup_arm64_pcrel_branch14 - The high 14 bits of a 16-bit pc-relative
  // immediate, used by test-bit-and-branch instructions.
  fixup_arm64_pcrel_branch14,

  // fixup_arm64_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
  // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this
  // is not used as part of a lo/hi pair and thus generates relocations
  // directly when necessary.
  fixup_arm64_pcrel_imm19,

  // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
  // immediate.
  fixup_arm64_pcrel_branch26,

  // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
  // immediate. Distinguished from branch26 only on ELF.
  fixup_arm64_pcrel_call26,

  // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF
  // R_AARCH64_TLSDESC_CALL relocation.
  fixup_arm64_tlsdesc_call,

  // Marker
  LastTargetFixupKind,
  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
};
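
// Illustrative instruction forms (editorial examples, not exhaustive) that
// carry these fixups while their operand is still a symbolic expression:
//   adr  x0, label               -> fixup_arm64_pcrel_adr_imm21
//   adrp x0, symbol              -> fixup_arm64_pcrel_adrp_imm21
//   ldr  x0, [x1, :lo12:symbol]  -> fixup_arm64_ldst_imm12_scale8
//   tbz  x0, #3, target          -> fixup_arm64_pcrel_branch14
//   bl   callee                  -> fixup_arm64_pcrel_call26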

} // end namespace ARM64
} // end namespace llvm

#endif
92
lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp
Normal file
@ -0,0 +1,92 @@
|
||||
//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declarations of the ARM64MCAsmInfo properties.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARM64MCAsmInfo.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
using namespace llvm;
|
||||
|
||||
enum AsmWriterVariantTy {
|
||||
Default = -1,
|
||||
Generic = 0,
|
||||
Apple = 1
|
||||
};
|
||||
|
||||
static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
|
||||
"arm64-neon-syntax", cl::init(Default),
|
||||
cl::desc("Choose style of NEON code to emit from ARM64 backend:"),
|
||||
cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
|
||||
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
|
||||
clEnumValEnd));
|
||||
|
||||
ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() {
|
||||
// We prefer NEON instructions to be printed in the short form.
|
||||
AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
|
||||
|
||||
PrivateGlobalPrefix = "L";
|
||||
SeparatorString = "%%";
|
||||
CommentString = ";";
|
||||
PointerSize = CalleeSaveStackSlotSize = 8;
|
||||
|
||||
AlignmentIsInBytes = false;
|
||||
UsesELFSectionDirectiveForBSS = true;
|
||||
SupportsDebugInformation = true;
|
||||
UseDataRegionDirectives = true;
|
||||
|
||||
ExceptionsType = ExceptionHandling::DwarfCFI;
|
||||
}
|
||||
|
||||
const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol(
|
||||
const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
|
||||
// On Darwin, we can reference dwarf symbols with foo@GOT-., which
|
||||
// is an indirect pc-relative reference. The default implementation
|
||||
// won't reference using the GOT, so we need this target-specific
|
||||
// version.
|
||||
MCContext &Context = Streamer.getContext();
|
||||
const MCExpr *Res =
|
||||
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
|
||||
MCSymbol *PCSym = Context.CreateTempSymbol();
|
||||
Streamer.EmitLabel(PCSym);
|
||||
const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
|
||||
return MCBinaryExpr::CreateSub(Res, PC, Context);
|
||||
}
|
||||
|
||||
ARM64MCAsmInfoELF::ARM64MCAsmInfoELF() {
|
||||
// We prefer NEON instructions to be printed in the short form.
|
||||
AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
|
||||
|
||||
PointerSize = 8;
|
||||
|
||||
// ".comm align is in bytes but .align is pow-2."
|
||||
AlignmentIsInBytes = false;
|
||||
|
||||
CommentString = "//";
|
||||
PrivateGlobalPrefix = ".L";
|
||||
Code32Directive = ".code\t32";
|
||||
|
||||
Data16bitsDirective = "\t.hword\t";
|
||||
Data32bitsDirective = "\t.word\t";
|
||||
Data64bitsDirective = "\t.xword\t";
|
||||
|
||||
UseDataRegionDirectives = false;
|
||||
|
||||
WeakRefDirective = "\t.weak\t";
|
||||
|
||||
HasLEB128 = true;
|
||||
SupportsDebugInformation = true;
|
||||
|
||||
// Exceptions handling
|
||||
ExceptionsType = ExceptionHandling::DwarfCFI;
|
||||
}
36
lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h
Normal file
@ -0,0 +1,36 @@
|
||||
//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declaration of the ARM64MCAsmInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef ARM64TARGETASMINFO_H
|
||||
#define ARM64TARGETASMINFO_H
|
||||
|
||||
#include "llvm/MC/MCAsmInfoDarwin.h"
|
||||
|
||||
namespace llvm {
|
||||
class Target;
|
||||
class StringRef;
|
||||
class MCStreamer;
|
||||
struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin {
|
||||
explicit ARM64MCAsmInfoDarwin();
|
||||
virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym,
|
||||
unsigned Encoding,
|
||||
MCStreamer &Streamer) const;
|
||||
};
|
||||
|
||||
struct ARM64MCAsmInfoELF : public MCAsmInfo {
|
||||
explicit ARM64MCAsmInfoELF();
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
#endif
563
lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
Normal file
@ -0,0 +1,563 @@
|
||||
//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the ARM64MCCodeEmitter class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "mccodeemitter"
|
||||
#include "MCTargetDesc/ARM64AddressingModes.h"
|
||||
#include "MCTargetDesc/ARM64BaseInfo.h"
|
||||
#include "MCTargetDesc/ARM64FixupKinds.h"
|
||||
#include "MCTargetDesc/ARM64MCExpr.h"
|
||||
#include "llvm/MC/MCCodeEmitter.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/MC/MCRegisterInfo.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
using namespace llvm;
|
||||
|
||||
STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
|
||||
STATISTIC(MCNumFixups, "Number of MC fixups created.");
|
||||
|
||||
namespace {
|
||||
|
||||
class ARM64MCCodeEmitter : public MCCodeEmitter {
|
||||
MCContext &Ctx;
|
||||
|
||||
ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
|
||||
void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
|
||||
public:
|
||||
ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
|
||||
MCContext &ctx)
|
||||
: Ctx(ctx) {}
|
||||
|
||||
~ARM64MCCodeEmitter() {}
|
||||
|
||||
// getBinaryCodeForInstr - TableGen'erated function for getting the
|
||||
// binary encoding for an instruction.
|
||||
uint64_t getBinaryCodeForInstr(const MCInst &MI,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getMachineOpValue - Return binary encoding of operand. If the machine
|
||||
/// operand requires relocation, record the relocation and return zero.
|
||||
unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getAMIndexed8OpValue - Return encoding info for base register
|
||||
/// and 12-bit unsigned immediate attached to a load, store or prfm
|
||||
/// instruction. If operand requires a relocation, record it and
|
||||
/// return zero in that part of the encoding.
|
||||
template <uint32_t FixupKind>
|
||||
uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
|
||||
/// target.
|
||||
uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
|
||||
/// the 2-bit shift field.
|
||||
uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getCondBranchTargetOpValue - Return the encoded value for a conditional
|
||||
/// branch target.
|
||||
uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
|
||||
/// branch target.
|
||||
uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getBranchTargetOpValue - Return the encoded value for an unconditional
|
||||
/// branch target.
|
||||
uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getMoveWideImmOpValue - Return the encoded value for the immediate operand
|
||||
/// of a MOVZ or MOVK instruction.
|
||||
uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getVecShifterOpValue - Return the encoded value for the vector shifter.
|
||||
uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getMoveVecShifterOpValue - Return the encoded value for the vector move
|
||||
/// shifter (MSL).
|
||||
uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getFixedPointScaleOpValue - Return the encoded value for the
|
||||
// FP-to-fixed-point scale factor.
|
||||
uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
/// getSIMDShift64OpValue - Return the encoded value for the
|
||||
// shift-by-immediate AdvSIMD instructions.
|
||||
uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
|
||||
|
||||
void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
|
||||
// Output the constant in little endian byte order.
|
||||
for (unsigned i = 0; i != Size; ++i) {
|
||||
EmitByte(Val & 255, OS);
|
||||
Val >>= 8;
|
||||
}
|
||||
}
|
||||
|
||||
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII,
|
||||
const MCRegisterInfo &MRI,
|
||||
const MCSubtargetInfo &STI,
|
||||
MCContext &Ctx) {
|
||||
return new ARM64MCCodeEmitter(MCII, STI, Ctx);
|
||||
}
|
||||
|
||||
/// getMachineOpValue - Return binary encoding of operand. If the machine
|
||||
/// operand requires relocation, record the relocation and return zero.
|
||||
unsigned
|
||||
ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
if (MO.isReg())
|
||||
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
|
||||
else {
|
||||
assert(MO.isImm() && "did not expect relocated expression");
|
||||
return static_cast<unsigned>(MO.getImm());
|
||||
}
|
||||
|
||||
assert(0 && "Unable to encode MCOperand!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <uint32_t FixupKind>
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
unsigned BaseReg = MI.getOperand(OpIdx).getReg();
|
||||
BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg);
|
||||
|
||||
const MCOperand &MO = MI.getOperand(OpIdx + 1);
|
||||
uint32_t ImmVal = 0;
|
||||
|
||||
if (MO.isImm())
|
||||
ImmVal = static_cast<uint32_t>(MO.getImm());
|
||||
else {
|
||||
assert(MO.isExpr() && "unable to encode load/store imm operand");
|
||||
MCFixupKind Kind = MCFixupKind(FixupKind);
|
||||
Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
|
||||
++MCNumFixups;
|
||||
}
|
||||
|
||||
return BaseReg | (ImmVal << 5);
|
||||
}
|
||||
|
||||
/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
|
||||
/// target.
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
// If the destination is an immediate, we have nothing to do.
|
||||
if (MO.isImm())
|
||||
return MO.getImm();
|
||||
assert(MO.isExpr() && "Unexpected ADR target type!");
|
||||
const MCExpr *Expr = MO.getExpr();
|
||||
|
||||
MCFixupKind Kind = MI.getOpcode() == ARM64::ADR
|
||||
? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21)
|
||||
: MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21);
|
||||
Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
|
||||
|
||||
MCNumFixups += 1;
|
||||
|
||||
// All of the information is in the fixup.
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
|
||||
/// the 2-bit shift field. The shift field is stored in bits 12-13 of the
|
||||
/// return value.
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
// Suboperands are [imm, shifter].
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
|
||||
assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL &&
|
||||
"unexpected shift type for add/sub immediate");
|
||||
unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm());
|
||||
assert((ShiftVal == 0 || ShiftVal == 12) &&
|
||||
"unexpected shift value for add/sub immediate");
|
||||
if (MO.isImm())
|
||||
return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
|
||||
assert(MO.isExpr() && "Unable to encode MCOperand!");
|
||||
const MCExpr *Expr = MO.getExpr();
|
||||
assert(ShiftVal == 0 && "shift not allowed on add/sub immediate with fixup");
|
||||
|
||||
// Encode the 12 bits of the fixup.
|
||||
MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12);
|
||||
Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
|
||||
|
||||
++MCNumFixups;
|
||||
|
||||
return 0;
|
||||
}
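
// Example (illustrative): "add x0, x1, #20, lsl #12" returns 20 | (1 << 12)
// from the code above, whereas "add x0, x1, :lo12:var" records a
// fixup_arm64_add_imm12 and returns 0 so the fixup supplies the bits later.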
|
||||
|
||||
/// getCondBranchTargetOpValue - Return the encoded value for a conditional
|
||||
/// branch target.
|
||||
uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue(
|
||||
const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
// If the destination is an immediate, we have nothing to do.
|
||||
if (MO.isImm())
|
||||
return MO.getImm();
|
||||
assert(MO.isExpr() && "Unexpected target type!");
|
||||
|
||||
MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_imm19);
|
||||
Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
|
||||
|
||||
++MCNumFixups;
|
||||
|
||||
// All of the information is in the fixup.
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
if (MO.isImm())
|
||||
return MO.getImm();
|
||||
assert(MO.isExpr() && "Unexpected movz/movk immediate");
|
||||
|
||||
Fixups.push_back(MCFixup::Create(
|
||||
0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc()));
|
||||
|
||||
++MCNumFixups;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
|
||||
/// branch target.
|
||||
uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue(
|
||||
const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
// If the destination is an immediate, we have nothing to do.
|
||||
if (MO.isImm())
|
||||
return MO.getImm();
|
||||
assert(MO.isExpr() && "Unexpected ADR target type!");
|
||||
|
||||
MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14);
|
||||
Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
|
||||
|
||||
++MCNumFixups;
|
||||
|
||||
// All of the information is in the fixup.
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getBranchTargetOpValue - Return the encoded value for an unconditional
|
||||
/// branch target.
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
// If the destination is an immediate, we have nothing to do.
|
||||
if (MO.isImm())
|
||||
return MO.getImm();
|
||||
assert(MO.isExpr() && "Unexpected ADR target type!");
|
||||
|
||||
MCFixupKind Kind = MI.getOpcode() == ARM64::BL
|
||||
? MCFixupKind(ARM64::fixup_arm64_pcrel_call26)
|
||||
: MCFixupKind(ARM64::fixup_arm64_pcrel_branch26);
|
||||
Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
|
||||
|
||||
++MCNumFixups;
|
||||
|
||||
// All of the information is in the fixup.
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// getVecShifterOpValue - Return the encoded value for the vector shifter:
|
||||
///
|
||||
/// 00 -> 0
|
||||
/// 01 -> 8
|
||||
/// 10 -> 16
|
||||
/// 11 -> 24
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the shift amount!");
|
||||
|
||||
switch (MO.getImm()) {
|
||||
default:
|
||||
break;
|
||||
case 0:
|
||||
return 0;
|
||||
case 8:
|
||||
return 1;
|
||||
case 16:
|
||||
return 2;
|
||||
case 24:
|
||||
return 3;
|
||||
}
|
||||
|
||||
assert(false && "Invalid value for vector shift amount!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the shift amount!");
|
||||
return 64 - (MO.getImm());
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the shift amount!");
|
||||
return 64 - (MO.getImm() | 32);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the shift amount!");
|
||||
return 32 - (MO.getImm() | 16);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the shift amount!");
|
||||
return 16 - (MO.getImm() | 8);
|
||||
}
|
||||
|
||||
/// getFixedPointScaleOpValue - Return the encoded value for the
|
||||
// FP-to-fixed-point scale factor.
|
||||
uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue(
|
||||
const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return 64 - MO.getImm();
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return 64 - MO.getImm();
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return 32 - MO.getImm();
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return 16 - MO.getImm();
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return 8 - MO.getImm();
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return MO.getImm() - 64;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return MO.getImm() - 32;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return MO.getImm() - 16;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() && "Expected an immediate value for the scale amount!");
|
||||
return MO.getImm() - 8;
|
||||
}
|
||||
|
||||
/// getMoveVecShifterOpValue - Return the encoded value for the vector move
|
||||
/// shifter (MSL).
|
||||
uint32_t
|
||||
ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
const MCOperand &MO = MI.getOperand(OpIdx);
|
||||
assert(MO.isImm() &&
|
||||
"Expected an immediate value for the move shift amount!");
|
||||
unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm());
|
||||
assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
|
||||
return ShiftVal == 8 ? 0 : 1;
|
||||
}
|
||||
|
||||
unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
// If one of the signed fixup kinds is applied to a MOVZ instruction, the
|
||||
// eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
|
||||
// job to ensure that any bits possibly affected by this are 0. This means we
|
||||
// must zero out bit 30 (essentially emitting a MOVN).
|
||||
MCOperand UImm16MO = MI.getOperand(1);
|
||||
|
||||
// Nothing to do if there's no fixup.
|
||||
if (UImm16MO.isImm())
|
||||
return EncodedValue;
|
||||
|
||||
return EncodedValue & ~(1u << 30);
|
||||
}
|
||||
|
||||
void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
||||
SmallVectorImpl<MCFixup> &Fixups,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
if (MI.getOpcode() == ARM64::TLSDESCCALL) {
|
||||
// This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
|
||||
// following (BLR) instruction. It doesn't emit any code itself so it
|
||||
// doesn't go through the normal TableGenerated channels.
|
||||
MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call);
|
||||
Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
|
||||
EmitConstant(Binary, 4, OS);
|
||||
++MCNumEmitted; // Keep track of the # of mi's emitted.
|
||||
}
|
||||
|
||||
#include "ARM64GenMCCodeEmitter.inc"
168
lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of the assembly expression modifiers
|
||||
// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "aarch64symbolrefexpr"
|
||||
#include "ARM64MCExpr.h"
|
||||
#include "llvm/MC/MCAssembler.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCELF.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/MC/MCValue.h"
|
||||
#include "llvm/Object/ELF.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
|
||||
MCContext &Ctx) {
|
||||
return new (Ctx) ARM64MCExpr(Expr, Kind);
|
||||
}
|
||||
|
||||
StringRef ARM64MCExpr::getVariantKindName() const {
|
||||
switch (static_cast<uint32_t>(getKind())) {
|
||||
case VK_CALL: return "";
|
||||
case VK_LO12: return ":lo12:";
|
||||
case VK_ABS_G3: return ":abs_g3:";
|
||||
case VK_ABS_G2: return ":abs_g2:";
|
||||
case VK_ABS_G2_NC: return ":abs_g2_nc:";
|
||||
case VK_ABS_G1: return ":abs_g1:";
|
||||
case VK_ABS_G1_NC: return ":abs_g1_nc:";
|
||||
case VK_ABS_G0: return ":abs_g0:";
|
||||
case VK_ABS_G0_NC: return ":abs_g0_nc:";
|
||||
case VK_DTPREL_G2: return ":dtprel_g2:";
|
||||
case VK_DTPREL_G1: return ":dtprel_g1:";
|
||||
case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
|
||||
case VK_DTPREL_G0: return ":dtprel_g0:";
|
||||
case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:";
|
||||
case VK_DTPREL_LO12: return ":dtprel_lo12:";
|
||||
case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:";
|
||||
case VK_TPREL_G2: return ":tprel_g2:";
|
||||
case VK_TPREL_G1: return ":tprel_g1:";
|
||||
case VK_TPREL_G1_NC: return ":tprel_g1_nc:";
|
||||
case VK_TPREL_G0: return ":tprel_g0:";
|
||||
case VK_TPREL_G0_NC: return ":tprel_g0_nc:";
|
||||
case VK_TPREL_LO12: return ":tprel_lo12:";
|
||||
case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
|
||||
case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
|
||||
case VK_ABS_PAGE: return "";
|
||||
case VK_GOT_PAGE: return ":got:";
|
||||
case VK_GOT_LO12: return ":got_lo12:";
|
||||
case VK_GOTTPREL_PAGE: return ":gottprel:";
|
||||
case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:";
|
||||
case VK_GOTTPREL_G1: return ":gottprel_g1:";
|
||||
case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
|
||||
case VK_TLSDESC: return "";
|
||||
case VK_TLSDESC_PAGE: return ":tlsdesc:";
|
||||
default:
|
||||
llvm_unreachable("Invalid ELF symbol kind");
|
||||
}
|
||||
}
|
||||
|
||||
void ARM64MCExpr::PrintImpl(raw_ostream &OS) const {
|
||||
if (getKind() != VK_NONE)
|
||||
OS << getVariantKindName();
|
||||
OS << *Expr;
|
||||
}
|
||||
|
||||
// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
|
||||
// that method should be made public?
|
||||
// FIXME: really do above: now that two backends are using it.
|
||||
static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
  switch (Value->getKind()) {
  case MCExpr::Target:
    llvm_unreachable("Can't handle nested target expr!");
    break;

  case MCExpr::Constant:
    break;

  case MCExpr::Binary: {
    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
    AddValueSymbolsImpl(BE->getLHS(), Asm);
    AddValueSymbolsImpl(BE->getRHS(), Asm);
    break;
  }

  case MCExpr::SymbolRef:
    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
    break;

  case MCExpr::Unary:
    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
    break;
  }
}

void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
  AddValueSymbolsImpl(getSubExpr(), Asm);
}

const MCSection *ARM64MCExpr::FindAssociatedSection() const {
  llvm_unreachable("FIXME: what goes here?");
}

bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
                                            const MCAsmLayout *Layout) const {
  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
    return false;

  Res =
      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());

  return true;
}

static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
  switch (Expr->getKind()) {
  case MCExpr::Target:
    llvm_unreachable("Can't handle nested target expression");
    break;
  case MCExpr::Constant:
    break;

  case MCExpr::Binary: {
    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
    break;
  }

  case MCExpr::SymbolRef: {
    // We're known to be under a TLS fixup, so any symbol should be
    // modified. There should be only one.
    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
    MCELF::SetType(SD, ELF::STT_TLS);
    break;
  }

  case MCExpr::Unary:
    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
    break;
  }
}

void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
  switch (getSymbolLoc(Kind)) {
  default:
    return;
  case VK_DTPREL:
  case VK_GOTTPREL:
  case VK_TPREL:
  case VK_TLSDESC:
    break;
  }

  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
}
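For orientation only, here is a minimal sketch (not part of the diff above) of how a client could wrap a symbol reference in one of these target expressions and print it; it assumes an already-initialized MCContext named Ctx, the declarations from ARM64MCExpr.h, and the MC API spellings of this LLVM revision (GetOrCreateSymbol, Create).

#include "ARM64MCExpr.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: build and print ":lo12:var" using an existing context.
static void printLo12Example(MCContext &Ctx) {
  MCSymbol *Var = Ctx.GetOrCreateSymbol("var");
  const MCExpr *Ref = MCSymbolRefExpr::Create(Var, Ctx);
  const ARM64MCExpr *Lo12 = ARM64MCExpr::Create(Ref, ARM64MCExpr::VK_LO12, Ctx);
  // PrintImpl prepends getVariantKindName(), so this writes ":lo12:var".
  errs() << *Lo12 << "\n";
}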
162 lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h Normal file
@ -0,0 +1,162 @@
//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes ARM64-specific MCExprs, used for modifiers like
// ":lo12:" or ":gottprel_g1:".
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ARM64MCEXPR_H
#define LLVM_ARM64MCEXPR_H

#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"

namespace llvm {

class ARM64MCExpr : public MCTargetExpr {
public:
  enum VariantKind {
    VK_NONE = 0x000,

    // Symbol locations specifying (roughly speaking) what calculation should be
    // performed to construct the final address for the relocated
    // symbol. E.g. direct, via the GOT, ...
    VK_ABS = 0x001,
    VK_SABS = 0x002,
    VK_GOT = 0x003,
    VK_DTPREL = 0x004,
    VK_GOTTPREL = 0x005,
    VK_TPREL = 0x006,
    VK_TLSDESC = 0x007,
    VK_SymLocBits = 0x00f,

    // Variants specifying which part of the final address calculation is
    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
    // MOVZ/MOVK.
    VK_PAGE = 0x010,
    VK_PAGEOFF = 0x020,
    VK_G0 = 0x030,
    VK_G1 = 0x040,
    VK_G2 = 0x050,
    VK_G3 = 0x060,
    VK_AddressFragBits = 0x0f0,

    // Whether the final relocation is a checked one (where a linker should
    // perform a range-check on the final address) or not. Note that this field
    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
    // on its own is a non-checked relocation. We side with ELF on being
    // explicit about this!
    VK_NC = 0x100,

    // Convenience definitions for referring to specific textual representations
    // of relocation specifiers. Note that this means the "_NC" is sometimes
    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
    // since a user would write ":lo12:").
    VK_CALL = VK_ABS,
    VK_ABS_PAGE = VK_ABS | VK_PAGE,
    VK_ABS_G3 = VK_ABS | VK_G3,
    VK_ABS_G2 = VK_ABS | VK_G2,
    VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC,
    VK_ABS_G1 = VK_ABS | VK_G1,
    VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC,
    VK_ABS_G0 = VK_ABS | VK_G0,
    VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
    VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
    VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
    VK_GOT_PAGE = VK_GOT | VK_PAGE,
    VK_DTPREL_G2 = VK_DTPREL | VK_G2,
    VK_DTPREL_G1 = VK_DTPREL | VK_G1,
    VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
    VK_DTPREL_G0 = VK_DTPREL | VK_G0,
    VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC,
    VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF,
    VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC,
    VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE,
    VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
    VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1,
    VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC,
    VK_TPREL_G2 = VK_TPREL | VK_G2,
    VK_TPREL_G1 = VK_TPREL | VK_G1,
    VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC,
    VK_TPREL_G0 = VK_TPREL | VK_G0,
    VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC,
    VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
    VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
    VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
    VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,

    VK_INVALID = 0xfff
  };

private:
  const MCExpr *Expr;
  const VariantKind Kind;

  explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind)
      : Expr(Expr), Kind(Kind) {}

public:
  /// @name Construction
  /// @{

  static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
                                   MCContext &Ctx);

  /// @}
  /// @name Accessors
  /// @{

  /// Get the kind of this expression.
  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }

  /// Get the expression this modifier applies to.
  const MCExpr *getSubExpr() const { return Expr; }

  /// @}
  /// @name VariantKind information extractors.
  /// @{

  static VariantKind getSymbolLoc(VariantKind Kind) {
    return static_cast<VariantKind>(Kind & VK_SymLocBits);
  }

  static VariantKind getAddressFrag(VariantKind Kind) {
    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
  }

  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }

  /// @}

  /// Convert the variant kind into an ELF-appropriate modifier
  /// (e.g. ":got:", ":lo12:").
  StringRef getVariantKindName() const;

  void PrintImpl(raw_ostream &OS) const;

  void AddValueSymbols(MCAssembler *) const;

  const MCSection *FindAssociatedSection() const;

  bool EvaluateAsRelocatableImpl(MCValue &Res,
                                 const MCAsmLayout *Layout) const;

  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;

  static bool classof(const MCExpr *E) {
    return E->getKind() == MCExpr::Target;
  }

  static bool classof(const ARM64MCExpr *) { return true; }

};
} // end namespace llvm

#endif
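A minimal sketch (not part of the commit) of how the bit-field encoding above decomposes; it assumes nothing beyond the declarations in this header.

#include "ARM64MCExpr.h"
#include <cassert>
using namespace llvm;

// ":dtprel_g1_nc:" = DTPREL symbol location | G1 address fragment | NC bit.
static void checkKindEncoding() {
  const ARM64MCExpr::VariantKind K = ARM64MCExpr::VK_DTPREL_G1_NC;
  assert(ARM64MCExpr::getSymbolLoc(K) == ARM64MCExpr::VK_DTPREL);
  assert(ARM64MCExpr::getAddressFrag(K) == ARM64MCExpr::VK_G1);
  assert(ARM64MCExpr::isNotChecked(K));
}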
167 lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp Normal file
@ -0,0 +1,167 @@
//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file provides ARM64 specific target descriptions.
//
//===----------------------------------------------------------------------===//

#include "ARM64MCTargetDesc.h"
#include "ARM64ELFStreamer.h"
#include "ARM64MCAsmInfo.h"
#include "InstPrinter/ARM64InstPrinter.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"

#define GET_INSTRINFO_MC_DESC
#include "ARM64GenInstrInfo.inc"

#define GET_SUBTARGETINFO_MC_DESC
#include "ARM64GenSubtargetInfo.inc"

#define GET_REGINFO_MC_DESC
#include "ARM64GenRegisterInfo.inc"

using namespace llvm;

static MCInstrInfo *createARM64MCInstrInfo() {
  MCInstrInfo *X = new MCInstrInfo();
  InitARM64MCInstrInfo(X);
  return X;
}

static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU,
                                                   StringRef FS) {
  MCSubtargetInfo *X = new MCSubtargetInfo();
  InitARM64MCSubtargetInfo(X, TT, CPU, FS);
  return X;
}

static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) {
  MCRegisterInfo *X = new MCRegisterInfo();
  InitARM64MCRegisterInfo(X, ARM64::LR);
  return X;
}

static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI,
                                       StringRef TT) {
  Triple TheTriple(TT);

  MCAsmInfo *MAI;
  if (TheTriple.isOSDarwin())
    MAI = new ARM64MCAsmInfoDarwin();
  else {
    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
    MAI = new ARM64MCAsmInfoELF();
  }

  // Initial state of the frame pointer is SP.
  unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true);
  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
  MAI->addInitialFrameState(Inst);

  return MAI;
}

MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
                                        CodeModel::Model CM,
                                        CodeGenOpt::Level OL) {
  Triple TheTriple(TT);
  assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
         "Only expect Darwin and ELF targets");

  if (CM == CodeModel::Default)
    CM = CodeModel::Small;
  // The default MCJIT memory managers make no guarantees about where they can
  // find an executable page; JITed code needs to be able to refer to globals
  // no matter how far away they are.
  else if (CM == CodeModel::JITDefault)
    CM = CodeModel::Large;
  else if (CM != CodeModel::Small && CM != CodeModel::Large)
    report_fatal_error("Only small and large code models are allowed on ARM64");

  // ARM64 Darwin is always PIC.
  if (TheTriple.isOSDarwin())
    RM = Reloc::PIC_;
  // On ELF platforms the default static relocation model has a smart enough
  // linker to cope with referencing external symbols defined in a shared
  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
    RM = Reloc::Static;

  MCCodeGenInfo *X = new MCCodeGenInfo();
  X->InitMCCodeGenInfo(RM, CM, OL);
  return X;
}

static MCInstPrinter *createARM64MCInstPrinter(const Target &T,
                                               unsigned SyntaxVariant,
                                               const MCAsmInfo &MAI,
                                               const MCInstrInfo &MII,
                                               const MCRegisterInfo &MRI,
                                               const MCSubtargetInfo &STI) {
  if (SyntaxVariant == 0)
    return new ARM64InstPrinter(MAI, MII, MRI, STI);
  if (SyntaxVariant == 1)
    return new ARM64AppleInstPrinter(MAI, MII, MRI, STI);

  return 0;
}

static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                    MCContext &Ctx, MCAsmBackend &TAB,
                                    raw_ostream &OS, MCCodeEmitter *Emitter,
                                    const MCSubtargetInfo &STI, bool RelaxAll,
                                    bool NoExecStack) {
  Triple TheTriple(TT);

  if (TheTriple.isOSDarwin())
    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
                               /*LabelSections*/ true);

  return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
}

// Force static initialization.
extern "C" void LLVMInitializeARM64TargetMC() {
  // Register the MC asm info.
  RegisterMCAsmInfoFn X(TheARM64Target, createARM64MCAsmInfo);

  // Register the MC codegen info.
  TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
                                        createARM64MCCodeGenInfo);

  // Register the MC instruction info.
  TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createARM64MCInstrInfo);

  // Register the MC register info.
  TargetRegistry::RegisterMCRegInfo(TheARM64Target, createARM64MCRegisterInfo);

  // Register the MC subtarget info.
  TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
                                          createARM64MCSubtargetInfo);

  // Register the asm backend.
  TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createARM64AsmBackend);

  // Register the MC Code Emitter
  TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
                                        createARM64MCCodeEmitter);

  // Register the object streamer.
  TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);

  // Register the MCInstPrinter.
  TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
                                        createARM64MCInstPrinter);
}
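For context, a rough sketch (not from this commit) of how a tool would consume these registrations once they have run. The triple string is only an example input, LLVMInitializeARM64TargetInfo is assumed to come from the TargetInfo library elsewhere in this patch, and the Target factory calls follow the registry API of this LLVM revision.

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include <memory>
#include <string>

extern "C" void LLVMInitializeARM64TargetInfo(); // assumed, from TargetInfo/
extern "C" void LLVMInitializeARM64TargetMC();

// Hypothetical helper: look up the "arm64" target and build MC-level objects.
static bool queryARM64MC() {
  LLVMInitializeARM64TargetInfo();
  LLVMInitializeARM64TargetMC();

  std::string Err;
  const llvm::Target *T =
      llvm::TargetRegistry::lookupTarget("arm64-apple-ios", Err);
  if (!T)
    return false; // target not linked into this tool

  // The factory functions registered above are reached through the Target.
  std::unique_ptr<llvm::MCRegisterInfo> MRI(
      T->createMCRegInfo("arm64-apple-ios"));
  std::unique_ptr<llvm::MCAsmInfo> MAI(
      T->createMCAsmInfo(*MRI, "arm64-apple-ios"));
  return MRI && MAI;
}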
62 lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h Normal file
@ -0,0 +1,62 @@
//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file provides ARM64 specific target descriptions.
//
//===----------------------------------------------------------------------===//

#ifndef ARM64MCTARGETDESC_H
#define ARM64MCTARGETDESC_H

#include "llvm/Support/DataTypes.h"
#include <string>

namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCRegisterInfo;
class MCObjectWriter;
class MCSubtargetInfo;
class StringRef;
class Target;
class raw_ostream;

extern Target TheARM64Target;

MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII,
                                        const MCRegisterInfo &MRI,
                                        const MCSubtargetInfo &STI,
                                        MCContext &Ctx);
MCAsmBackend *createARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
                                    StringRef TT, StringRef CPU);

MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI);

MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
                                            uint32_t CPUSubtype);

} // End llvm namespace

// Defines symbolic names for ARM64 registers. This defines a mapping from
// register name to register number.
//
#define GET_REGINFO_ENUM
#include "ARM64GenRegisterInfo.inc"

// Defines symbolic names for the ARM64 instructions.
//
#define GET_INSTRINFO_ENUM
#include "ARM64GenInstrInfo.inc"

#define GET_SUBTARGETINFO_ENUM
#include "ARM64GenSubtargetInfo.inc"

#endif
396 lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp Normal file
@ -0,0 +1,396 @@
//===-- ARM64MachObjectWriter.cpp - ARM64 Mach Object Writer --------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/ARM64FixupKinds.h"
#include "MCTargetDesc/ARM64MCTargetDesc.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
using namespace llvm;

namespace {
class ARM64MachObjectWriter : public MCMachObjectTargetWriter {
  bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
                                  const MCSymbolRefExpr *Sym,
                                  unsigned &Log2Size, const MCAssembler &Asm);

public:
  ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
                                 /*UseAggressiveSymbolFolding=*/true) {}

  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
                        const MCAsmLayout &Layout, const MCFragment *Fragment,
                        const MCFixup &Fixup, MCValue Target,
                        uint64_t &FixedValue);
};
}

bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
    const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
    unsigned &Log2Size, const MCAssembler &Asm) {
  RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
  Log2Size = ~0U;

  switch ((unsigned)Fixup.getKind()) {
  default:
    return false;

  case FK_Data_1:
    Log2Size = llvm::Log2_32(1);
    return true;
  case FK_Data_2:
    Log2Size = llvm::Log2_32(2);
    return true;
  case FK_Data_4:
    Log2Size = llvm::Log2_32(4);
    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
    return true;
  case FK_Data_8:
    Log2Size = llvm::Log2_32(8);
    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
    return true;
  case ARM64::fixup_arm64_add_imm12:
  case ARM64::fixup_arm64_ldst_imm12_scale1:
  case ARM64::fixup_arm64_ldst_imm12_scale2:
  case ARM64::fixup_arm64_ldst_imm12_scale4:
  case ARM64::fixup_arm64_ldst_imm12_scale8:
  case ARM64::fixup_arm64_ldst_imm12_scale16:
    Log2Size = llvm::Log2_32(4);
    switch (Sym->getKind()) {
    default:
      assert(0 && "Unexpected symbol reference variant kind!");
    case MCSymbolRefExpr::VK_PAGEOFF:
      RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
      return true;
    case MCSymbolRefExpr::VK_GOTPAGEOFF:
      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
      return true;
    case MCSymbolRefExpr::VK_TLVPPAGEOFF:
      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
      return true;
    }
  case ARM64::fixup_arm64_pcrel_adrp_imm21:
    Log2Size = llvm::Log2_32(4);
    // This encompasses the relocation for the whole 21-bit value.
    switch (Sym->getKind()) {
    default:
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "ADR/ADRP relocations must be GOT relative");
    case MCSymbolRefExpr::VK_PAGE:
      RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
      return true;
    case MCSymbolRefExpr::VK_GOTPAGE:
      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
      return true;
    case MCSymbolRefExpr::VK_TLVPPAGE:
      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
      return true;
    }
    return true;
  case ARM64::fixup_arm64_pcrel_branch26:
  case ARM64::fixup_arm64_pcrel_call26:
    Log2Size = llvm::Log2_32(4);
    RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
    return true;
  }
}

void ARM64MachObjectWriter::RecordRelocation(
    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
    uint64_t &FixedValue) {
  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());

  // See <reloc.h>.
  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
  unsigned Log2Size = 0;
  int64_t Value = 0;
  unsigned Index = 0;
  unsigned IsExtern = 0;
  unsigned Type = 0;
  unsigned Kind = Fixup.getKind();

  FixupOffset += Fixup.getOffset();

  // ARM64 pcrel relocation addends do not include the section offset.
  if (IsPCRel)
    FixedValue += FixupOffset;
  // ADRP fixups use relocations for the whole symbol value and only
  // put the addend in the instruction itself. Clear out any value the
  // generic code figured out from the symbol definition.
  if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21 ||
      Kind == ARM64::fixup_arm64_pcrel_imm19)
    FixedValue = 0;

  // imm19 relocations are for conditional branches, which require
  // assembler local symbols. If we got here, that's not what we have,
  // so complain loudly.
  if (Kind == ARM64::fixup_arm64_pcrel_imm19) {
    Asm.getContext().FatalError(Fixup.getLoc(),
                                "conditional branch requires assembler-local"
                                " label. '" +
                                    Target.getSymA()->getSymbol().getName() +
                                    "' is external.");
    return;
  }

  // 14-bit branch relocations should only target internal labels, and so
  // should never get here.
  if (Kind == ARM64::fixup_arm64_pcrel_branch14) {
    Asm.getContext().FatalError(Fixup.getLoc(),
                                "Invalid relocation on conditional branch!");
    return;
  }

  if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
                                  Asm)) {
    Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!");
    return;
  }

  Value = Target.getConstant();

  if (Target.isAbsolute()) { // constant
    // FIXME: Should this always be extern?
    // SymbolNum of 0 indicates the absolute section.
    Type = MachO::ARM64_RELOC_UNSIGNED;
    Index = 0;

    if (IsPCRel) {
      IsExtern = 1;
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "PC relative absolute relocation!");

      // FIXME: x86_64 sets the type to a branch reloc here. Should we do
      // something similar?
    }
  } else if (Target.getSymB()) { // A - B + constant
    const MCSymbol *A = &Target.getSymA()->getSymbol();
    MCSymbolData &A_SD = Asm.getSymbolData(*A);
    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);

    const MCSymbol *B = &Target.getSymB()->getSymbol();
    MCSymbolData &B_SD = Asm.getSymbolData(*B);
    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);

    // Check for "_foo@got - .", which comes through here as:
    // Ltmp0:
    //  ... _foo@got - Ltmp0
    if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
        Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
        Layout.getSymbolOffset(&B_SD) ==
            Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
      // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
      Index = A_Base->getIndex();
      IsExtern = 1;
      Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
      IsPCRel = 1;
      MachO::any_relocation_info MRE;
      MRE.r_word0 = FixupOffset;
      MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
                     (IsExtern << 27) | (Type << 28));
      Writer->addRelocation(Fragment->getParent(), MRE);
      return;
    } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
               Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
      // Otherwise, neither symbol can be modified.
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "unsupported relocation of modified symbol");

    // We don't support PCrel relocations of differences.
    if (IsPCRel)
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "unsupported pc-relative relocation of "
                                  "difference");
    // ARM64 always uses external relocations. If there is no symbol to use as
    // a base address (a local symbol with no preceding non-local symbol),
    // error out.
    //
    // FIXME: We should probably just synthesize an external symbol and use
    // that.
    if (!A_Base)
      Asm.getContext().FatalError(
          Fixup.getLoc(),
          "unsupported relocation of local symbol '" + A->getName() +
              "'. Must have non-local symbol earlier in section.");
    if (!B_Base)
      Asm.getContext().FatalError(
          Fixup.getLoc(),
          "unsupported relocation of local symbol '" + B->getName() +
              "'. Must have non-local symbol earlier in section.");

    if (A_Base == B_Base && A_Base)
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "unsupported relocation with identical base");

    Value += (A_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
                                                   &A_SD, Layout)) -
             (A_Base == NULL || A_Base->getFragment() == NULL
                  ? 0
                  : Writer->getSymbolAddress(A_Base, Layout));
    Value -= (B_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
                                                   &B_SD, Layout)) -
             (B_Base == NULL || B_Base->getFragment() == NULL
                  ? 0
                  : Writer->getSymbolAddress(B_Base, Layout));

    Index = A_Base->getIndex();
    IsExtern = 1;
    Type = MachO::ARM64_RELOC_UNSIGNED;

    MachO::any_relocation_info MRE;
    MRE.r_word0 = FixupOffset;
    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
                   (IsExtern << 27) | (Type << 28));
    Writer->addRelocation(Fragment->getParent(), MRE);

    Index = B_Base->getIndex();
    IsExtern = 1;
    Type = MachO::ARM64_RELOC_SUBTRACTOR;
  } else { // A + constant
    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
    const MCSymbolData *Base = Asm.getAtom(&SD);
    const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
        Fragment->getParent()->getSection());
    // If the symbol is a variable and we weren't able to get a Base for it
    // (i.e., it's not in the symbol table associated with a section), resolve
    // the relocation based on its expansion instead.
    if (Symbol->isVariable() && !Base) {
      // If the evaluation is an absolute value, just use that directly
      // to keep things easy.
      int64_t Res;
      if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
              Res, Layout, Writer->getSectionAddressMap())) {
        FixedValue = Res;
        return;
      }

      // FIXME: Will the Target we already have ever have any data in it
      // we need to preserve and merge with the new Target? How about
      // the FixedValue?
      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
        Asm.getContext().FatalError(Fixup.getLoc(),
                                    "unable to resolve variable '" +
                                        Symbol->getName() + "'");
      return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                              FixedValue);
    }

    // Relocations inside debug sections always use local relocations when
    // possible. This seems to be done because the debugger doesn't fully
    // understand relocation entries and expects to find values that
    // have already been fixed up.
    if (Symbol->isInSection()) {
      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
        Base = 0;
    }

    // ARM64 uses external relocations as much as possible. For debug sections,
    // and for pointer-sized relocations (.quad), we allow section relocations.
    // It's code sections that run into trouble.
    if (Base) {
      Index = Base->getIndex();
      IsExtern = 1;

      // Add the local offset, if needed.
      if (Base != &SD)
        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
    } else if (Symbol->isInSection()) {
      // Pointer-sized relocations can use a local relocation. Otherwise,
      // we have to be in a debug info section.
      if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
        Asm.getContext().FatalError(
            Fixup.getLoc(),
            "unsupported relocation of local symbol '" + Symbol->getName() +
                "'. Must have non-local symbol earlier in section.");
      // Adjust the relocation to be section-relative.
      // The index is the section ordinal (1-based).
      const MCSectionData &SymSD =
          Asm.getSectionData(SD.getSymbol().getSection());
      Index = SymSD.getOrdinal() + 1;
      IsExtern = 0;
      Value += Writer->getSymbolAddress(&SD, Layout);

      if (IsPCRel)
        Value -= Writer->getFragmentAddress(Fragment, Layout) +
                 Fixup.getOffset() + (1 << Log2Size);
    } else {
      // Resolve constant variables.
      if (SD.getSymbol().isVariable()) {
        int64_t Res;
        if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
                Res, Layout, Writer->getSectionAddressMap())) {
          FixedValue = Res;
          return;
        }
      }
      Asm.getContext().FatalError(Fixup.getLoc(),
                                  "unsupported relocation of variable '" +
                                      Symbol->getName() + "'");
    }
  }

  // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
  // is represented via an Addend relocation, not encoded directly into
  // the instruction.
  if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
       Type == MachO::ARM64_RELOC_PAGE21 ||
       Type == MachO::ARM64_RELOC_PAGEOFF12) &&
      Value) {
    assert((Value & 0xff000000) == 0 && "Added relocation out of range!");

    MachO::any_relocation_info MRE;
    MRE.r_word0 = FixupOffset;
    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
                   (IsExtern << 27) | (Type << 28));
    Writer->addRelocation(Fragment->getParent(), MRE);

    // Now set up the Addend relocation.
    Type = MachO::ARM64_RELOC_ADDEND;
    Index = Value;
    IsPCRel = 0;
    Log2Size = 2;
    IsExtern = 0;

    // Put zero into the instruction itself. The addend is in the relocation.
    Value = 0;
  }

  // If there's any addend left to handle, encode it in the instruction.
  FixedValue = Value;

  // struct relocation_info (8 bytes)
  MachO::any_relocation_info MRE;
  MRE.r_word0 = FixupOffset;
  MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
                 (IsExtern << 27) | (Type << 28));
  Writer->addRelocation(Fragment->getParent(), MRE);
}

MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS,
                                                  uint32_t CPUType,
                                                  uint32_t CPUSubtype) {
  return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype),
                                OS, /*IsLittleEndian=*/true);
}
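As a side note, a small sketch (not part of the commit) of the r_word1 packing used repeatedly above; the field widths simply mirror the shifts in RecordRelocation, and the example values below are arbitrary.

#include <cstdint>

// 24-bit symbol/section index, 1-bit pcrel flag, 2-bit log2(size),
// 1-bit extern flag, 4-bit relocation type -- the same shifts as above.
static uint32_t packRelocWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                               bool IsExtern, uint32_t Type) {
  return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
         (uint32_t(IsExtern) << 27) | (Type << 28);
}

// Example: Index = 5, IsPCRel = true, Log2Size = 2 (4-byte instruction),
// IsExtern = true, Type = 2 packs to 0x2D000005.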
14 lib/Target/ARM64/MCTargetDesc/CMakeLists.txt Normal file
@ -0,0 +1,14 @@
add_llvm_library(LLVMARM64Desc
  ARM64AsmBackend.cpp
  ARM64ELFObjectWriter.cpp
  ARM64ELFStreamer.cpp
  ARM64MCAsmInfo.cpp
  ARM64MCCodeEmitter.cpp
  ARM64MCExpr.cpp
  ARM64MCTargetDesc.cpp
  ARM64MachObjectWriter.cpp
)
add_dependencies(LLVMARM64Desc ARM64CommonTableGen)

# Hack: we need to include 'main' target directory to grab private headers
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
24 lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt Normal file
@ -0,0 +1,24 @@
;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
;   http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Library
name = ARM64Desc
parent = ARM64
required_libraries = ARM64AsmPrinter ARM64Info MC Support
add_to_library_groups = ARM64
16 lib/Target/ARM64/MCTargetDesc/Makefile Normal file
@ -0,0 +1,16 @@
##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##

LEVEL = ../../../..
LIBRARYNAME = LLVMARM64Desc

# Hack: we need to include 'main' target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..

include $(LEVEL)/Makefile.common
25 lib/Target/ARM64/Makefile Normal file
@ -0,0 +1,25 @@
##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##

LEVEL = ../../..
LIBRARYNAME = LLVMARM64CodeGen
TARGET = ARM64

# Make sure that tblgen is run, first thing.
BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \
                ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \
                ARM64GenDAGISel.inc \
                ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \
                ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \
                ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \
                ARM64GenMCPseudoLowering.inc

DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc

include $(LEVEL)/Makefile.common
Some files were not shown because too many files have changed in this diff.