ARM & AArch64: make use of common cmpxchg idioms after expansion

The C and C++ semantics for compare_exchange require it to return a bool
indicating success. This gets mapped to LLVM IR in which each cmpxchg is
followed by an icmp of the loaded value against the expected value.

When lowered to ldxr/stxr loops, this extra comparison is redundant: its
result is already implicit in the control flow of the expanded function.

This commit makes two changes: it replaces that icmp with appropriate PHI
nodes, and then makes sure SimplifyCFG is run after expansion to actually make
use of the opportunities revealed.
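
As a rough illustration, this is the kind of IR the expansion now recognises
(a hypothetical function, written with the single-result cmpxchg form in use
at the time; the names are illustrative only):

define i1 @cas(i32* %p, i32 %expected, i32 %new) {
  ; cmpxchg returns the value that was loaded from %p.
  %loaded = cmpxchg i32* %p, i32 %expected, i32 %new seq_cst seq_cst
  ; This icmp is what becomes redundant once the cmpxchg is turned into an
  ; ldxr/stxr loop: success or failure is already decided by the branch taken
  ; out of that loop.
  %success = icmp eq i32 %loaded, %expected
  ret i1 %success
}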

I've also added -{arm,aarch64}-atomic-cfg-tidy options, so that existing
fragile tests aren't perturbed too much by the change. Many of them either
rely on undef/unreachable too pervasively to be restored to something
well-defined (particularly while making sure they still exercise the same
obscure assert from many years ago), or depend on a particular CFG shape,
which is disrupted by SimplifyCFG.

rdar://problem/16227836

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209883 91177308-0d34-0410-b5e6-96231b3b80d8
Tim Northover 2014-05-30 10:09:59 +00:00
parent 7be505ae88
commit d0dbe02fd2
59 changed files with 347 additions and 87 deletions


@ -300,12 +300,50 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);

  // Finally, make sure later instructions don't get reordered with a fence if
  // necessary.
  // Make sure later instructions don't get reordered with a fence if necessary.
  Builder.SetInsertPoint(BarrierBB);
  insertTrailingFence(Builder, SuccessOrder);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.

  // Setup the builder so we can create any PHIs we need.
  Builder.SetInsertPoint(FailureBB, FailureBB->begin());
  BasicBlock *SuccessBB = FailureOrder == Monotonic ? BarrierBB : TryStoreBB;
  PHINode *Success = 0, *Failure = 0;

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  for (auto User : CI->users()) {
    ICmpInst *ICmp = dyn_cast<ICmpInst>(User);
    if (!ICmp)
      continue;

    // Because we know ICmp uses CI, we only need one operand to be the old
    // value.
    if (ICmp->getOperand(0) != CI->getCompareOperand() &&
        ICmp->getOperand(1) != CI->getCompareOperand())
      continue;

    if (ICmp->getPredicate() == CmpInst::ICMP_EQ) {
      if (!Success) {
        Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
        Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
        Success->addIncoming(ConstantInt::getFalse(Ctx), LoopBB);
      }
      ICmp->replaceAllUsesWith(Success);
    } else if (ICmp->getPredicate() == CmpInst::ICMP_NE) {
      if (!Failure) {
        Failure = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
        Failure->addIncoming(ConstantInt::getFalse(Ctx), SuccessBB);
        Failure->addIncoming(ConstantInt::getTrue(Ctx), LoopBB);
      }
      ICmp->replaceAllUsesWith(Failure);
    }
  }

  CI->replaceAllUsesWith(Loaded);
  CI->eraseFromParent();
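
For reference, the expanded form looks roughly like the sketch below: the
outcome is encoded purely in which block control flow reaches, so the old
icmp can be rewritten to read a PHI of constants. This is only a hand-written
sketch (block names, fence placement and the exact join structure differ from
what the pass actually emits), written with the ARM ldrex/strex intrinsics:

declare i32 @llvm.arm.ldrex.p0i32(i32*)
declare i32 @llvm.arm.strex.p0i32(i32, i32*)

define i1 @cas_expanded(i32* %p, i32 %expected, i32 %new) {
entry:
  fence seq_cst                               ; leading barrier
  br label %loop
loop:
  %loaded = call i32 @llvm.arm.ldrex.p0i32(i32* %p)
  %should_store = icmp eq i32 %loaded, %expected
  br i1 %should_store, label %trystore, label %failure
trystore:
  %status = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %p)
  %tryagain = icmp ne i32 %status, 0
  br i1 %tryagain, label %loop, label %barrier
barrier:                                      ; reached only on success
  fence seq_cst
  br label %done
failure:                                      ; reached only on failure
  fence seq_cst
  br label %done
done:
  ; Users of the old "icmp eq %loaded, %expected" are redirected to this PHI,
  ; so the redundant compare (and, after SimplifyCFG, any dead blocks) go away.
  %success = phi i1 [ true, %barrier ], [ false, %failure ]
  ret i1 %success
}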


@ -53,6 +53,12 @@ static cl::opt<bool>
EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
                   " optimization pass"), cl::init(true), cl::Hidden);

static cl::opt<bool>
EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
                 cl::desc("Run SimplifyCFG after expanding atomic operations"
                          " to make use of cmpxchg flow-based information"),
                 cl::init(true));

extern "C" void LLVMInitializeAArch64Target() {
  // Register the target.
  RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@ -113,6 +119,7 @@ public:
    return getTM<AArch64TargetMachine>();
  }

  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addILPOpts() override;
@ -135,6 +142,20 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new AArch64PassConfig(this, PM);
}

void AArch64PassConfig::addIRPasses() {
  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
  // ourselves.
  addPass(createAtomicExpandLoadLinkedPass(TM));

  // Cmpxchg instructions are often used with a subsequent comparison to
  // determine whether it succeeded. We can exploit existing control-flow in
  // ldrex/strex loops to simplify this, but it needs tidying up.
  if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
    addPass(createCFGSimplificationPass());

  TargetPassConfig::addIRPasses();
}

// Pass Pipeline Configuration
bool AArch64PassConfig::addPreISel() {
  // Run promote constant before global merge, so that the promoted constants
@ -146,10 +167,6 @@ bool AArch64PassConfig::addPreISel() {
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createAArch64AddressTypePromotionPass());

  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
  // ourselves.
  addPass(createAtomicExpandLoadLinkedPass(TM));

  return false;
}


@ -28,6 +28,12 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
cl::desc("Inhibit optimization of S->D register accesses on A15"),
cl::init(false));
static cl::opt<bool>
EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
cl::desc("Run SimplifyCFG after expanding atomic operations"
" to make use of cmpxchg flow-based information"),
cl::init(true));
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget);
@ -213,6 +219,7 @@ public:
    return *getARMTargetMachine().getSubtargetImpl();
  }

  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addPreRegAlloc() override;
@ -225,11 +232,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new ARMPassConfig(this, PM);
}

bool ARMPassConfig::addPreISel() {
void ARMPassConfig::addIRPasses() {
  const ARMSubtarget *Subtarget = &getARMSubtarget();
  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
    addPass(createAtomicExpandLoadLinkedPass(TM));

    // Cmpxchg instructions are often used with a subsequent comparison to
    // determine whether it succeeded. We can exploit existing control-flow in
    // ldrex/strex loops to simplify this, but it needs tidying up.
    if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
      addPass(createCFGSimplificationPass());
  }

  TargetPassConfig::addIRPasses();
}

bool ARMPassConfig::addPreISel() {
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createGlobalMergePass(TM));


@ -1,4 +1,4 @@
; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0


@ -1,4 +1,4 @@
; RUN: llc %s -o - | FileCheck %s
; RUN: llc %s -o - -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; Check that ANDS (tst) is not merged with ADD when the immediate
; is not 0.
; <rdar://problem/16693089>


@ -1,4 +1,4 @@
; RUN: llc -O3 < %s | FileCheck %s
; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-ios"
; rdar://12462006


@ -1,4 +1,4 @@
; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
; RUN: llc < %s -stress-early-ifcvt -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-macosx"
; CHECK: mm2


@ -1,4 +1,4 @@
; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s
; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
@lhs = global fp128 zeroinitializer, align 16
@rhs = global fp128 zeroinitializer, align 16


@ -1,4 +1,4 @@
; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
; rdar://11935841
define void @t1() nounwind ssp {


@ -1,4 +1,4 @@
; RUN: llc < %s -march=arm64 | FileCheck %s
; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
;
; Get the actual value of the overflow bit.


@ -889,8 +889,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@ -911,8 +910,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@ -933,8 +931,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb


@ -1,5 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
@addr = global i8* null


@ -1,4 +1,4 @@
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@stored_label = global i8* null


@ -0,0 +1,89 @@
; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s
define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_return:
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
; CHECK: cmp [[LOADED]], w1
; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
; CHECK: cbnz [[STATUS]], [[LOOP]]
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: orr w0, wzr, #0x1
; CHECK: ret
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: mov w0, wzr
; CHECK: ret
%loaded = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
%success = icmp eq i32 %loaded, %oldval
%conv = zext i1 %success to i32
ret i32 %conv
}
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxrb [[LOADED:w[0-9]+]], [x0]
; CHECK: cmp [[LOADED]], w1, uxtb
; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
; CHECK: stlxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
; CHECK: cbnz [[STATUS]], [[LOOP]]
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: mov w0, wzr
; CHECK: ret
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: orr w0, wzr, #0x1
; CHECK: ret
%loaded = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
%failure = icmp ne i8 %loaded, %oldValue
ret i1 %failure
}
define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_conditional:
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
; CHECK: cmp [[LOADED]], w1
; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [x0]
; CHECK: cbnz [[STATUS]], [[LOOP]]
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: b _bar
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: b _baz
%loaded = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
%success = icmp eq i32 %loaded, %oldval
br i1 %success, label %true, label %false
true:
tail call void @bar() #2
br label %end
false:
tail call void @baz() #2
br label %end
end:
ret void
}
declare void @bar()
declare void @baz()


@ -1,5 +1,5 @@
; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:


@ -1,4 +1,4 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; LLVM should be able to cope with multiple uses of the same flag-setting
; instruction at different points of a routine. Either by rematerializing the


@ -1,6 +1,6 @@
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable


@ -1,4 +1,4 @@
; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; This file contains tests for the AArch64 load/store optimizer.


@ -1,4 +1,4 @@
; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; We've got the usual issues with LLVM reordering blocks here. The
; tests are correct for the current order, but who knows when that


@ -1,4 +1,4 @@
; RUN: llc -mcpu=cortex-a8 < %s | FileCheck %s
; RUN: llc -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "armv7-eabi"


@ -1,4 +1,4 @@
; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp < %s | FileCheck %s
; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
; PR5423
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s | FileCheck %s
; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s
; Radar 8589805: Counting the number of microcoded operations, such as for an
; LDM instruction, was causing an assertion failure because the microop count
; was being treated as an instruction count.


@ -1,4 +1,4 @@
; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s | FileCheck %s
; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://8959122 illegal register operands for UMULL instruction
; in cfrac nightly test.
; Armv6 generates a umull that must write to two distinct destination regs.


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
;CHECK-LABEL: foo:
;CHECK: adds


@ -1,4 +1,4 @@
; RUN: llc < %s -mcpu=cortex-a8 | FileCheck %s
; RUN: llc < %s -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; ModuleID = 'bugpoint-reduced-simplified.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7--linux-gnueabi"


@ -1,8 +1,8 @@
; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck -check-prefix=ARM %s
; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck -check-prefix=THUMB %s
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=ARM %s
; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=THUMB %s
; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
; RUN: | FileCheck -check-prefix=T2 %s
; RUN: llc -mtriple=thumbv8-eabi %s -o - | FileCheck -check-prefix=V8 %s
; RUN: llc -mtriple=thumbv8-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=V8 %s
; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.


@ -189,9 +189,9 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]]
; CHECK-THUMB: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3
; CHECK-THUMB-LE: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-THUMB: bne
; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
; CHECK-THUMB: cmp


@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 | FileCheck %s -check-prefix=CHECKV6
; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 \
; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKV6
; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKT2D
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-atomic-cfg-tidy=0 \
; RUN: | FileCheck %s -check-prefix=CHECKELF
; Enable tailcall optimization for iOS 5.0


@ -0,0 +1,101 @@
; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s | FileCheck %s
define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_return:
; CHECK: dmb ishst
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], r1
; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
; CHECK: strex [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
; CHECK: cmp [[STATUS]], #0
; CHECK: bne [[LOOP]]
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #1
; CHECK: dmb ish
; CHECK: bx lr
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #0
; CHECK: dmb ish
; CHECK: bx lr
%loaded = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
%success = icmp eq i32 %loaded, %oldval
%conv = zext i1 %success to i32
ret i32 %conv
}
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: dmb ishst
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], [[OLDBYTE]]
; CHECK: itt ne
; CHECK: movne r0, #1
; CHECK: bxne lr
; CHECK: strexb [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
; CHECK: cmp [[STATUS]], #0
; CHECK: bne [[LOOP]]
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #0
; CHECK: bx lr
%loaded = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
%failure = icmp ne i8 %loaded, %oldValue
ret i1 %failure
}
define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_conditional:
; CHECK: dmb ishst
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], r1
; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
; CHECK: cmp [[STATUS]], #0
; CHECK: bne [[LOOP]]
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: dmb ish
; CHECK: b.w _bar
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: dmb ish
; CHECK: b.w _baz
%loaded = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
%success = icmp eq i32 %loaded, %oldval
br i1 %success, label %true, label %false
true:
tail call void @bar() #2
br label %end
false:
tail call void @baz() #2
br label %end
end:
ret void
}
declare void @bar()
declare void @baz()


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
; RUN: llc < %s -mtriple=armv7-apple-darwin -arm-atomic-cfg-tidy=0 | FileCheck %s
define double @f1() nounwind {
; CHECK-LABEL: f1:


@ -1,4 +1,4 @@
; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s
; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 -mattr=+v6,+vfp2 %s -o - | FileCheck %s
@i = weak global i32 0 ; <i32*> [#uses=2]
@u = weak global i32 0 ; <i32*> [#uses=2]


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -o /dev/null 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -arm-atomic-cfg-tidy=0 -o /dev/null 2>&1 | FileCheck %s
%struct.S = type { i8* (i8*)*, [1 x i8] }
define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly {


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a9 | FileCheck %s
; RUN: llc < %s -mtriple=arm-apple-ios -arm-atomic-cfg-tidy=0 -mcpu=cortex-a9 | FileCheck %s
; rdar://8402126
; Make sure if-converter is not predicating vldmia and ldmia. These are
; micro-coded and would have long issue latency even if predicated on


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
; If ARMBaseInstrInfo::AnalyzeBlocks returns the wrong value, which was possible
; for blocks with indirect branches, the IfConverter could end up deleting


@ -1,4 +1,4 @@
; RUN: llc -regalloc=greedy < %s | FileCheck %s
; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
; LSR shouldn't introduce more induction variables than needed, increasing
; register pressure and therefore spilling. There is more room for improvement


@ -1,5 +1,5 @@
; REQUIRES: asserts
; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched %s -o - 2>&1 | FileCheck %s
; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s
;
; Loop counter copies should be eliminated.
; There is also a MUL here, but we don't care where it is scheduled.


@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s
; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -regalloc=basic | FileCheck %s
; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
%struct.int16x8_t = type { <8 x i16> }


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon | FileCheck %s
; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4789
%bar = type { float, float, float }


@ -1,4 +1,4 @@
; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs -arm-atomic-cfg-tidy=0 | FileCheck %s
; Check a spill right after a function call with large struct byval is correctly
; generated.


@ -1,5 +1,5 @@
; Tests for the two-address instruction pass.
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -arm-atomic-cfg-tidy=0 %s -o - | FileCheck %s
define void @PR13378() nounwind {
; This was orriginally a crasher trying to schedule the instructions.


@ -1,4 +1,4 @@
; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -arm-atomic-cfg-tidy=0 | FileCheck %s
@csize = external global [100 x [20 x [4 x i8]]] ; <[100 x [20 x [4 x i8]]]*> [#uses=1]
@vsize = external global [100 x [20 x [4 x i8]]] ; <[100 x [20 x [4 x i8]]]*> [#uses=1]


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4659
; PR4682


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -disable-cgp-branch-opts | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -disable-cgp-branch-opts -arm-atomic-cfg-tidy=0 | FileCheck %s
%struct.pix_pos = type { i32, i32, i32, i32, i32, i32 }


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -O3 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s
; rdar://7493908
; Make sure the result of the first dynamic_alloc isn't copied back to sp more


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O3 -relocation-model=pic | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O3 -relocation-model=pic -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://8115404
; Tail merging must not split an IT block.


@ -1,5 +1,5 @@
; rdar://8465407
; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
%struct.buf = type opaque


@ -1,4 +1,4 @@
; RUN: llc -mtriple=thumbv7-apple-darwin10 < %s | FileCheck %s
; RUN: llc -mtriple=thumbv7-apple-darwin10 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
%struct.op = type { %struct.op*, %struct.op*, %struct.op* ()*, i32, i16, i16, i8, i8 }


@ -1,4 +1,4 @@
; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 | FileCheck %s
; Formerly crashed, 3573915.
define void @RotateStarsFP_Vec() nounwind {


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
define void @fht(float* nocapture %fz, i16 signext %n) nounwind {
; CHECK-LABEL: fht:


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
; rdar://7352504
; Make sure we use "str r9, [sp, #+28]" instead of "sub.w r4, r7, #256" followed by "str r9, [r4, #-32]".


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 -arm-atomic-cfg-tidy=0 | FileCheck %s
; If-conversion defeats the purpose of this test, which is to check
; conditional branch generation, so a call to make sure it doesn't
; happen and we get actual branches.


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://7354379
declare double @foo(double) nounwind readnone


@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-default-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-no-restrict-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 -arm-default-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-atomic-cfg-tidy=0 -arm-no-restrict-it | FileCheck %s
define void @foo(i32 %X, i32 %Y) {
entry:


@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-no-restrict-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -arm-default-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-atomic-cfg-tidy=0 -arm-no-restrict-it | FileCheck %s
; There shouldn't be a unconditional branch at end of bb52.
; rdar://7184787


@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-elf -mattr=+neon | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-elf -mattr=+neon -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4789
%bar = type { float, float, float }


@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }


@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it | FileCheck %s
; CHECK: it ne
; CHECK-NEXT: cmpne
; CHECK-NEXT: bne [[JUMPTARGET:.LBB[0-9]+_[0-9]+]]


@ -1,4 +1,4 @@
; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - -arm-atomic-cfg-tidy=0 | FileCheck %s
;
; LSR should only check for valid address modes when the IV user is a
; memory address.