diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 5f8cda5e5f4..43ee7d27da7 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -516,6 +516,18 @@ def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">;
 
 let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
 
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+      Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+      Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+      Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
 // Vector maximum.
 def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
 def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 0984e60cd2c..0c844be3c55 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7583,8 +7583,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     default: return false;
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_lvebx:
       VT = MVT::i8;
       break;
@@ -7605,8 +7609,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     default: return false;
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_stvebx:
       VT = MVT::i8;
       break;
@@ -9094,7 +9102,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
   case Intrinsic::ppc_altivec_lvehx:
-  case Intrinsic::ppc_altivec_lvewx: {
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_lvebx:
@@ -9106,6 +9116,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_altivec_lvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
@@ -9126,7 +9139,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::ppc_altivec_stvxl:
   case Intrinsic::ppc_altivec_stvebx:
   case Intrinsic::ppc_altivec_stvehx:
-  case Intrinsic::ppc_altivec_stvewx: {
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_stvebx:
@@ -9138,6 +9153,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_altivec_stvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 3dfdf0806e3..522e0de7386 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -55,7 +55,7 @@ let Uses = [RM] in {
   def LXVD2X : XX1Form<31, 844,
                        (outs vsrc:$XT), (ins memrr:$src),
                        "lxvd2x $XT, $src", IIC_LdStLFD,
-                       [(set v2f64:$XT, (load xoaddr:$src))]>;
+                       [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
 
   def LXVDSX : XX1Form<31, 332,
                        (outs vsrc:$XT), (ins memrr:$src),
@@ -64,7 +64,7 @@ let Uses = [RM] in {
   def LXVW4X : XX1Form<31, 780,
                        (outs vsrc:$XT), (ins memrr:$src),
                        "lxvw4x $XT, $src", IIC_LdStLFD,
-                       [(set v4i32:$XT, (load xoaddr:$src))]>;
+                       [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
 }
 
 // Store indexed instructions
@@ -77,12 +77,12 @@ let Uses = [RM] in {
   def STXVD2X : XX1Form<31, 972,
                         (outs), (ins vsrc:$XT, memrr:$dst),
                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
-                        [(store v2f64:$XT, xoaddr:$dst)]>;
+                        [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
 
   def STXVW4X : XX1Form<31, 908,
                         (outs), (ins vsrc:$XT, memrr:$dst),
                         "stxvw4x $XT, $dst", IIC_LdStSTFD,
-                        [(store v4i32:$XT, xoaddr:$dst)]>;
+                        [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
 }
 
 // Add/Mul Instructions
@@ -851,11 +851,14 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
           (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
 
 // Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
 
 // Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
-          (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
 
 // Selects.
 def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8987ee0e848..87e49a11e6e 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -613,6 +613,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new LoadInst(Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_lxvw4x:
+  case Intrinsic::ppc_vsx_lxvd2x: {
+    // Turn PPC VSX loads into normal loads.
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                        PointerType::getUnqual(II->getType()));
+    return new LoadInst(Ptr, Twine(""), false, 1);
+  }
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
     // Turn stvx -> store if the pointer is known aligned.
@@ -624,6 +631,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_stxvw4x:
+  case Intrinsic::ppc_vsx_stxvd2x: {
+    // Turn PPC VSX stores into normal stores.
+    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+    return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+  }
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index da4a20481e6..9dff9a755ad 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -177,21 +177,27 @@ entry:
   store <2 x double> %1, <2 x double>* %arrayidx3, align 8
   ret void
 
+; Note: There is some unavoidable changeability in this variant. If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment. There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
 ; CHECK-LABEL: @testv3
 ; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C1:[0-9]+]], 48
 ; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C3:[0-9]+]], 16
 
 ; Note: We could convert this next FMA to M-type as well, but it would require
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
new file mode 100644
index 00000000000..0c9ebef8757
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+  %0 = load <4 x i32>* @vsi, align 16
+  %1 = load <4 x i32>* @vui, align 16
+  %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+  %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+  %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+  %5 = load <2 x double>* @vd, align 16
+  store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+  store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+  store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+  store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+  store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+  store <2 x double> %5, <2 x double>* @res_vd, align 16
+  ret void
+}
diff --git a/test/Transforms/InstCombine/vsx-unaligned.ll b/test/Transforms/InstCombine/vsx-unaligned.ll
new file mode 100644
index 00000000000..26e04268f44
--- /dev/null
+++ b/test/Transforms/InstCombine/vsx-unaligned.ll
@@ -0,0 +1,44 @@
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = common global <4 x float> zeroinitializer, align 1
+@res_vf = common global <4 x float> zeroinitializer, align 1
+@vd = common global <2 x double> zeroinitializer, align 1
+@res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+  %t1 = alloca <4 x float>*, align 8
+  %t2 = alloca <2 x double>*, align 8
+  store <4 x float>* @vf, <4 x float>** %t1, align 8
+  %0 = load <4 x float>** %t1, align 8
+  %1 = bitcast <4 x float>* %0 to i8*
+  %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+  store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+  %3 = load <4 x float>** %t1, align 8
+  %4 = bitcast <4 x float>* %3 to i8*
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+  store <2 x double>* @vd, <2 x double>** %t2, align 8
+  %5 = load <2 x double>** %t2, align 8
+  %6 = bitcast <2 x double>* %5 to i8*
+  %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+  store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+  %8 = load <2 x double>** %t2, align 8
+  %9 = bitcast <2 x double>* %8 to i8*
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+  ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)
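
For reference, a minimal standalone IR sketch, not part of the patch itself (the function @copy_v2f64 and its arguments are invented for illustration): it shows the intended direct use of the new intrinsics on pointers with no particular alignment. With this patch, -instcombine rewrites such calls into ordinary load/store instructions with align 1 (as checked in vsx-unaligned.ll above), and the new patterns in PPCInstrVSX.td should then select lxvd2x/stxvd2x for them when VSX is available.

; Illustrative sketch only (not part of this patch): copy a <2 x double>
; between two possibly-unaligned buffers through the new VSX intrinsics.
define void @copy_v2f64(i8* %src, i8* %dst) {
entry:
  %v = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %src)
  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %v, i8* %dst)
  ret void
}

declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)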