Add support for generating CMPXCHG16B on x86-64 for the cmpxchg IR instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138660 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Eli Friedman 2011-08-26 21:21:21 +00:00
parent 51fb91c04c
commit 43f51aeca8
8 changed files with 107 additions and 48 deletions

View File

@ -68,6 +68,9 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions", "Support 64-bit instructions",
[FeatureCMOV]>; [FeatureCMOV]>;
def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
"64-bit with cmpxchg16b",
[Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">; "Bit testing of memory is slow">;
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
@ -112,26 +115,31 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4", [FeatureSSE2]>;
def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B,
def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B,
def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"atom", [FeatureSSE3, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
// "Arrandale" along with corei3 and corei5 // "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureFastUAMem, FeatureAES]>; FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>;
def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureFastUAMem]>; FeatureSlowBTMem, FeatureFastUAMem]>;
// Westmere is a similar machine to nehalem with some additional features. // Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge // Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureFastUAMem, FeatureAES, FeatureCLMUL]>; FeatureSlowBTMem, FeatureFastUAMem, FeatureAES,
FeatureCLMUL]>;
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset. // rather than a superset.
// FIXME: Disabling AVX for now since it's not ready. // FIXME: Disabling AVX for now since it's not ready.
def : Proc<"corei7-avx", [FeatureSSE42, Feature64Bit, def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureAES, FeatureCLMUL]>; FeatureAES, FeatureCLMUL]>;
def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6", [FeatureMMX]>;
@ -150,19 +158,21 @@ def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit, def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit, def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit, def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>; Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A, def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A,
Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>; Feature3DNowA, FeatureCMPXCHG16B,
def : Proc<"istanbul", [Feature3DNowA, Feature64Bit, FeatureSSE4A, FeatureSlowBTMem]>;
Feature3DNowA]>; def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B,
def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A, FeatureSSE4A, Feature3DNowA]>;
def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A,
Feature3DNowA]>; Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>; def : Proc<"winchip-c6", [FeatureMMX]>;

View File

@ -478,6 +478,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
} }
if (Subtarget->hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags // FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() && if (!Subtarget->isTargetDarwin() &&
!Subtarget->isTargetELF() && !Subtarget->isTargetELF() &&
@ -10421,37 +10425,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
} }
case ISD::ATOMIC_CMP_SWAP: { case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0); EVT T = N->getValueType(0);
assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); assert (T == MVT::i64 || T == MVT::i128 && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH; SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, MVT::i32)); DAG.getConstant(0, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, MVT::i32)); DAG.getConstant(1, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, Regs64bit ? X86::RAX : X86::EAX,
cpInL.getValue(1)); cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH; SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, MVT::i32)); DAG.getConstant(0, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, MVT::i32)); DAG.getConstant(1, HalfT));
swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
cpInH.getValue(1)); Regs64bit ? X86::RBX : X86::EBX,
swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, swapInL, cpInH.getValue(1));
swapInL.getValue(1)); swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
Regs64bit ? X86::RCX : X86::ECX,
swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0), SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1), N->getOperand(1),
swapInH.getValue(1) }; swapInH.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
X86ISD::LCMPXCHG8_DAG;
SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
Ops, 3, T, MMO); Ops, 3, T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
MVT::i32, Result.getValue(1)); Regs64bit ? X86::RAX : X86::EAX,
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, HalfT, Result.getValue(1));
MVT::i32, cpOutL.getValue(2)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
Results.push_back(cpOutH.getValue(1)); Results.push_back(cpOutH.getValue(1));
return; return;
} }

View File

@ -303,9 +303,10 @@ namespace llvm {
ATOMNAND64_DAG, ATOMNAND64_DAG,
ATOMSWAP64_DAG, ATOMSWAP64_DAG,
// LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap. // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
LCMPXCHG_DAG, LCMPXCHG_DAG,
LCMPXCHG8_DAG, LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
// VZEXT_LOAD - Load, scalar_to_vector, and zero extend. // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
VZEXT_LOAD, VZEXT_LOAD,

View File

@ -665,12 +665,20 @@ def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
// Atomic compare and swap. // Atomic compare and swap.
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
isCodeGenOnly = 1 in { isCodeGenOnly = 1 in
def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
"lock\n\t" "lock\n\t"
"cmpxchg8b\t$ptr", "cmpxchg8b\t$ptr",
[(X86cas8 addr:$ptr)]>, TB, LOCK; [(X86cas8 addr:$ptr)]>, TB, LOCK;
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
isCodeGenOnly = 1 in
def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
"lock\n\t"
"cmpxchg16b\t$ptr",
[(X86cas16 addr:$ptr)]>, TB, LOCK,
Requires<[HasCmpxchg16b]>;
let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in { let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap), def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
"lock\n\t" "lock\n\t"

View File

@ -65,7 +65,7 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>; SDTCisVT<2, i8>]>;
def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>, def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
@ -133,9 +133,13 @@ def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>; SDNPMayLoad, SDNPMemOperand]>;
def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8, def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>; SDNPMayLoad, SDNPMemOperand]>;
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
[SDNPHasChain, SDNPMayStore, [SDNPHasChain, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>; SDNPMayLoad, SDNPMemOperand]>;
@ -466,6 +470,7 @@ def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def FPStackf32 : Predicate<"!Subtarget->hasXMM()">; def FPStackf32 : Predicate<"!Subtarget->hasXMM()">;
def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">; def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">, def In32BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit">; AssemblerPredicate<"!Mode64Bit">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">, def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@ -1190,7 +1195,7 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", []>, TB; "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;

View File

@ -203,6 +203,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); ToggleFeature(X86::FeatureFMA3); HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); ToggleFeature(X86::FeatureFMA3);
HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); ToggleFeature(X86::FeaturePOPCNT); HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); ToggleFeature(X86::FeaturePOPCNT);
HasAES = IsIntel && ((ECX >> 25) & 0x1); ToggleFeature(X86::FeatureAES); HasAES = IsIntel && ((ECX >> 25) & 0x1); ToggleFeature(X86::FeatureAES);
HasCmpxchg16b = ((ECX >> 13) & 0x1); ToggleFeature(X86::FeatureCMPXCHG16B);
if (IsIntel || IsAMD) { if (IsIntel || IsAMD) {
// Determine if bit test memory instructions are slow. // Determine if bit test memory instructions are slow.
@ -254,6 +255,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, IsBTMemSlow(false) , IsBTMemSlow(false)
, IsUAMemFast(false) , IsUAMemFast(false)
, HasVectorUAMem(false) , HasVectorUAMem(false)
, HasCmpxchg16b(false)
, stackAlignment(8) , stackAlignment(8)
// FIXME: this is a known good value for Yonah. How about others? // FIXME: this is a known good value for Yonah. How about others?
, MaxInlineSizeThreshold(128) , MaxInlineSizeThreshold(128)

View File

@ -100,6 +100,10 @@ protected:
/// operands. This may require setting a feature bit in the processor. /// operands. This may require setting a feature bit in the processor.
bool HasVectorUAMem; bool HasVectorUAMem;
/// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
/// this is true for most x86-64 chips, but not the first AMD chips.
bool HasCmpxchg16b;
/// stackAlignment - The minimum alignment known to hold of the stack frame on /// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function. /// entry to the function and which must be maintained by every function.
unsigned stackAlignment; unsigned stackAlignment;
@ -168,6 +172,7 @@ public:
bool isBTMemSlow() const { return IsBTMemSlow; } bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; } bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
const Triple &getTargetTriple() const { return TargetTriple; } const Triple &getTargetTriple() const { return TargetTriple; }

View File

@ -0,0 +1,13 @@
; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
; Basic 128-bit cmpxchg
define void @t1(i128* nocapture %p) nounwind ssp {
entry:
; CHECK movl $1, %ebx
; CHECK: lock
; CHECK-NEXT: cmpxchg16b
%r = cmpxchg i128* %p, i128 0, i128 1 seq_cst
ret void
}
; FIXME: Handle 128-bit atomicrmw/load atomic/store atomic