diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index afa6cf090d0..e089047d013 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -166,13 +166,6 @@ See CodeGen/SystemZ/alloca-01.ll for an example.
 
 --
 
-Atomic loads and stores use the default compare-and-swap based implementation.
-This is much too conservative in practice, since the architecture guarantees
-that 1-, 2-, 4- and 8-byte loads and stores to aligned addresses are
-inherently atomic.
-
---
-
 If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
 
 --
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index af5b52f1aee..b0a8fca7de7 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -134,10 +134,10 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       setOperationAction(ISD::SDIVREM, VT, Custom);
       setOperationAction(ISD::UDIVREM, VT, Custom);
 
-      // Expand ATOMIC_LOAD and ATOMIC_STORE using ATOMIC_CMP_SWAP.
-      // FIXME: probably much too conservative.
-      setOperationAction(ISD::ATOMIC_LOAD, VT, Expand);
-      setOperationAction(ISD::ATOMIC_STORE, VT, Expand);
+      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
+      // stores, putting a serialization instruction after the stores.
+      setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
+      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 
       // No special instructions for these.
       setOperationAction(ISD::CTPOP, VT, Expand);
@@ -2001,11 +2001,32 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
                                    MVT::i64, HighOp, Low32);
 }
 
+// Op is an atomic load. Lower it into a normal volatile load.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
+                        Node->getChain(), Node->getBasePtr(),
+                        Node->getMemoryVT(), Node->getMemOperand());
+}
+
+// Op is an atomic store. Lower it into a normal volatile store followed
+// by a serialization.
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
+                                    Node->getBasePtr(), Node->getMemoryVT(),
+                                    Node->getMemOperand());
+  return SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), MVT::Other,
+                                    Chain), 0);
+}
+
 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first
 // two into the fullword ATOMIC_LOADW_* operation given by Opcode.
-SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
-                                                SelectionDAG &DAG,
-                                                unsigned Opcode) const {
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
+                                                   SelectionDAG &DAG,
+                                                   unsigned Opcode) const {
   AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
 
   // 32-bit operations need no code outside the main loop.
@@ -2195,27 +2216,31 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   case ISD::OR:
     return lowerOR(Op, DAG);
   case ISD::ATOMIC_SWAP:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+  case ISD::ATOMIC_STORE:
+    return lowerATOMIC_STORE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+    return lowerATOMIC_LOAD(Op, DAG);
   case ISD::ATOMIC_LOAD_ADD:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
   case ISD::ATOMIC_LOAD_SUB:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
   case ISD::ATOMIC_LOAD_AND:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
   case ISD::ATOMIC_LOAD_OR:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
   case ISD::ATOMIC_LOAD_XOR:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
   case ISD::ATOMIC_LOAD_NAND:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
   case ISD::ATOMIC_LOAD_MIN:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
   case ISD::ATOMIC_LOAD_MAX:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
   case ISD::ATOMIC_LOAD_UMIN:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
   case ISD::ATOMIC_LOAD_UMAX:
-    return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
   case ISD::ATOMIC_CMP_SWAP:
     return lowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STACKSAVE:
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 0b36f9fc7fe..4cbb30da8b5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -276,8 +276,10 @@ private:
   SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG,
-                           unsigned Opcode) const;
+  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
+                              unsigned Opcode) const;
   SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/SystemZ/atomic-load-01.ll b/test/CodeGen/SystemZ/atomic-load-01.ll
index a5bc8833e78..f3acd605b01 100644
--- a/test/CodeGen/SystemZ/atomic-load-01.ll
+++ b/test/CodeGen/SystemZ/atomic-load-01.ll
@@ -2,11 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that loads are handled.
-; The CS-based sequence is probably far too conservative.
 define i8 @f1(i8 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lb %r2, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic i8 *%src seq_cst, align 1
   ret i8 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-02.ll b/test/CodeGen/SystemZ/atomic-load-02.ll
index 2c9bbdb488a..d9bec60f4c1 100644
--- a/test/CodeGen/SystemZ/atomic-load-02.ll
+++ b/test/CodeGen/SystemZ/atomic-load-02.ll
@@ -2,11 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that loads are handled.
-; The CS-based sequence is probably far too conservative.
 define i16 @f1(i16 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lh %r2, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic i16 *%src seq_cst, align 2
   ret i16 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-03.ll b/test/CodeGen/SystemZ/atomic-load-03.ll
index 1fb41f5e39a..7e5eb9249a9 100644
--- a/test/CodeGen/SystemZ/atomic-load-03.ll
+++ b/test/CodeGen/SystemZ/atomic-load-03.ll
@@ -2,12 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that loads are handled.
-; Using CS is probably too conservative.
-define i32 @f1(i32 %dummy, i32 *%src) {
+define i32 @f1(i32 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: lhi %r2, 0
-; CHECK: cs %r2, %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: l %r2, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic i32 *%src seq_cst, align 4
   ret i32 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-04.ll b/test/CodeGen/SystemZ/atomic-load-04.ll
index 92cac406e20..c7a9a98a425 100644
--- a/test/CodeGen/SystemZ/atomic-load-04.ll
+++ b/test/CodeGen/SystemZ/atomic-load-04.ll
@@ -2,12 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that loads are handled.
-; Using CSG is probably too conservative.
-define i64 @f1(i64 %dummy, i64 *%src) {
+define i64 @f1(i64 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: lghi %r2, 0
-; CHECK: csg %r2, %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lg %r2, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic i64 *%src seq_cst, align 8
   ret i64 %val
diff --git a/test/CodeGen/SystemZ/atomic-store-01.ll b/test/CodeGen/SystemZ/atomic-store-01.ll
index 53ed24f623c..952e1a91216 100644
--- a/test/CodeGen/SystemZ/atomic-store-01.ll
+++ b/test/CodeGen/SystemZ/atomic-store-01.ll
@@ -2,11 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that stores are handled.
-; The CS-based sequence is probably far too conservative.
 define void @f1(i8 %val, i8 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: stc %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
 ; CHECK: br %r14
   store atomic i8 %val, i8 *%src seq_cst, align 1
   ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-02.ll b/test/CodeGen/SystemZ/atomic-store-02.ll
index 42d6695b51d..c9576e55656 100644
--- a/test/CodeGen/SystemZ/atomic-store-02.ll
+++ b/test/CodeGen/SystemZ/atomic-store-02.ll
@@ -2,11 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that stores are handled.
-; The CS-based sequence is probably far too conservative.
 define void @f1(i16 %val, i16 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: sth %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
 ; CHECK: br %r14
   store atomic i16 %val, i16 *%src seq_cst, align 2
   ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-03.ll b/test/CodeGen/SystemZ/atomic-store-03.ll
index 846c86fd366..459cb6a94e1 100644
--- a/test/CodeGen/SystemZ/atomic-store-03.ll
+++ b/test/CodeGen/SystemZ/atomic-store-03.ll
@@ -2,14 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that stores are handled.
-; Using CS is probably too conservative.
 define void @f1(i32 %val, i32 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: l %r0, 0(%r3)
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: cs %r0, %r2, 0(%r3)
-; CHECK: jl [[LABEL]]
+; CHECK: st %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
 ; CHECK: br %r14
   store atomic i32 %val, i32 *%src seq_cst, align 4
   ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-04.ll b/test/CodeGen/SystemZ/atomic-store-04.ll
index 24615b11565..7f2406eb546 100644
--- a/test/CodeGen/SystemZ/atomic-store-04.ll
+++ b/test/CodeGen/SystemZ/atomic-store-04.ll
@@ -2,14 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; This is just a placeholder to make sure that stores are handled.
-; Using CS is probably too conservative.
 define void @f1(i64 %val, i64 *%src) {
 ; CHECK-LABEL: f1:
-; CHECK: lg %r0, 0(%r3)
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: csg %r0, %r2, 0(%r3)
-; CHECK: jl [[LABEL]]
+; CHECK: stg %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
 ; CHECK: br %r14
   store atomic i64 %val, i64 *%src seq_cst, align 8
   ret void
diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll
index d55ea2133e8..62e9796fa21 100644
--- a/test/CodeGen/SystemZ/cond-store-01.ll
+++ b/test/CodeGen/SystemZ/cond-store-01.ll
@@ -347,11 +347,10 @@ define void @f19(i8 *%ptr, i8 %alt, i32 %limit) {
 define void @f20(i8 *%ptr, i8 %alt, i32 %limit) {
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f20:
-; CHECK: cs {{%r[0-9]+}},
-; CHECK: jl
+; CHECK: lb {{%r[0-9]+}}, 0(%r2)
 ; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
 ; CHECK: [[LABEL]]:
-; CHECK: stc {{%r[0-9]+}},
+; CHECK: stc {{%r[0-9]+}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load atomic i8 *%ptr unordered, align 1
@@ -367,7 +366,7 @@ define void @f21(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: jhe [[LABEL:[^ ]*]]
 ; CHECK: lb %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-9]+}},
+; CHECK: stc %r3, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll
index 91bc4860b38..4fbcdaba510 100644
--- a/test/CodeGen/SystemZ/cond-store-02.ll
+++ b/test/CodeGen/SystemZ/cond-store-02.ll
@@ -347,11 +347,10 @@ define void @f19(i16 *%ptr, i16 %alt, i32 %limit) {
 define void @f20(i16 *%ptr, i16 %alt, i32 %limit) {
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f20:
-; CHECK: cs {{%r[0-9]+}},
-; CHECK: jl
+; CHECK: lh {{%r[0-9]+}}, 0(%r2)
 ; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
 ; CHECK: [[LABEL]]:
-; CHECK: sth {{%r[0-9]+}},
+; CHECK: sth {{%r[0-9]+}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load atomic i16 *%ptr unordered, align 2
@@ -367,7 +366,7 @@ define void @f21(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: jhe [[LABEL:[^ ]*]]
 ; CHECK: lh %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-9]+}},
+; CHECK: sth %r3, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll
index d4fd48d6132..4b22555d0d6 100644
--- a/test/CodeGen/SystemZ/cond-store-03.ll
+++ b/test/CodeGen/SystemZ/cond-store-03.ll
@@ -272,7 +272,7 @@ define void @f15(i32 *%ptr, i32 %alt, i32 %limit) {
 define void @f16(i32 *%ptr, i32 %alt, i32 %limit) {
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f16:
-; CHECK: cs {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2)
+; CHECK: l {{%r[0-5]}}, 0(%r2)
 ; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
 ; CHECK: [[LABEL]]:
 ; CHECK: st {{%r[0-5]}}, 0(%r2)
@@ -291,7 +291,7 @@ define void @f17(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: jhe [[LABEL:[^ ]*]]
 ; CHECK: l %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-5]}}, %r3, 0(%r2)
+; CHECK: st %r3, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll
index fc565c432ff..346b51a17d7 100644
--- a/test/CodeGen/SystemZ/cond-store-04.ll
+++ b/test/CodeGen/SystemZ/cond-store-04.ll
@@ -164,7 +164,7 @@ define void @f9(i64 *%ptr, i64 %alt, i32 %limit) {
 define void @f10(i64 *%ptr, i64 %alt, i32 %limit) {
 ; FIXME: should use a normal load instead of CSG.
 ; CHECK-LABEL: f10:
-; CHECK: csg {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2)
+; CHECK: lg {{%r[0-5]}}, 0(%r2)
 ; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
 ; CHECK: [[LABEL]]:
 ; CHECK: stg {{%r[0-5]}}, 0(%r2)
@@ -183,7 +183,7 @@ define void @f11(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: jhe [[LABEL:[^ ]*]]
 ; CHECK: lg %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
-; CHECK: csg {{%r[0-5]}}, %r3, 0(%r2)
+; CHECK: stg %r3, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
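Note (not part of the patch itself): a minimal C++ sketch of the kind of source that exercises the new lowering. The variable and function names are illustrative only; the expected instruction sequences are the ones the updated atomic-load/atomic-store tests above check for, with the exact mask on the bcr serialization (14 or 15) left to the backend.

// Illustrative only: compiled for s390x-linux-gnu at -O2, the seq_cst
// accesses below become atomic loads/stores in LLVM IR.  With this patch
// they should select a plain load/store (here l/st for i32) plus a
// "bcr 1x, %r0" serialization, rather than a compare-and-swap loop.
#include <atomic>

std::atomic<int> Counter;

int loadCounter() {
  // Becomes a "load atomic ... seq_cst" in IR; handled by lowerATOMIC_LOAD.
  return Counter.load(std::memory_order_seq_cst);
}

void storeCounter(int Value) {
  // Becomes a "store atomic ... seq_cst" in IR; handled by lowerATOMIC_STORE,
  // i.e. a normal store followed by a serialization instruction.
  Counter.store(Value, std::memory_order_seq_cst);
}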