Remove the 256-bit AVX non-temporal store intrinsics and auto-upgrade
existing calls to them.

  * include/llvm/IntrinsicsX86.td: drops the definitions of
    int_x86_avx_movnt_dq_256, int_x86_avx_movnt_pd_256 and
    int_x86_avx_movnt_ps_256.
  * lib/Target/X86/X86InstrSSE.td: drops the three Pat<> entries that
    selected VMOVNTDQYmr / VMOVNTPDYmr / VMOVNTPSYmr for those intrinsics.
  * lib/VMCore/AutoUpgrade.cpp: UpgradeIntrinsicFunction1 now also matches
    the three x86.avx.movnt.*.256 names (setting NewFn = 0 and returning
    true). UpgradeIntrinsicCall rewrites each such call as a bitcast of the
    pointer operand to a pointer to the stored type, followed by a store
    carrying "nontemporal" metadata, and then erases the call. The vpermil
    name tests in the fallback branch become exact comparisons instead of
    prefix matches.
  * test/CodeGen/X86/avx-intrinsics-x86.ll: adds movnt_dq, movnt_ps and
    movnt_pd tests that still call the removed intrinsics and CHECK for the
    corresponding movnt instruction.

Review notes:
  * The upgraded store is given alignment 32 rather than 16. It stores a
    256-bit vector, and the ymm forms of VMOVNTDQ / VMOVNTPD / VMOVNTPS
    require a 32-byte-aligned address, so 16 understated what the replaced
    intrinsic guaranteed.
  * SmallVector's template arguments and the constant vector operands of the
    add / fadd test instructions were missing and have been restored; the
    constant values are a reconstruction and should be checked against the
    upstream test.
  * Line breaks, blank lines and leading indentation were re-created from
    the hunk line counts. If a context line fails to match, apply with
    whitespace-insensitive matching (for example "patch -l").

diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 1abeb9e5559..805d3667d15 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -1282,16 +1282,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
         Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], []>;
 }
 
-// Cacheability support ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_avx_movnt_dq_256 : GCCBuiltin<"__builtin_ia32_movntdq256">,
-        Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty], []>;
-  def int_x86_avx_movnt_pd_256 : GCCBuiltin<"__builtin_ia32_movntpd256">,
-        Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], []>;
-  def int_x86_avx_movnt_ps_256 : GCCBuiltin<"__builtin_ia32_movntps256">,
-        Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], []>;
-}
-
 // Conditional load ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d6755cea41c..a518997b786 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3336,13 +3336,6 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
                     IIC_SSE_MOVNT>, VEX;
 }
 
-def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
-          (VMOVNTDQYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
-          (VMOVNTPDYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
-          (VMOVNTPSYmr addr:$dst, VR256:$src)>;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 2e16372fb1c..522b07e4545 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -57,7 +57,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name.startswith("x86.sse2.pcmpgt.") ||
         Name.startswith("x86.avx2.pcmpeq.") ||
         Name.startswith("x86.avx2.pcmpgt.") ||
-        Name.startswith("x86.avx.vpermil.")) {
+        Name.startswith("x86.avx.vpermil.") ||
+        Name == "x86.avx.movnt.dq.256" ||
+        Name == "x86.avx.movnt.pd.256" ||
+        Name == "x86.avx.movnt.ps.256") {
       NewFn = 0;
       return true;
     }
@@ -118,15 +121,40 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                   "pcmpgt");
       // need to sign extend since icmp returns vector of i1
       Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+    } else if (Name == "llvm.x86.avx.movnt.dq.256" ||
+               Name == "llvm.x86.avx.movnt.ps.256" ||
+               Name == "llvm.x86.avx.movnt.pd.256") {
+      IRBuilder<> Builder(C);
+      Builder.SetInsertPoint(CI->getParent(), CI);
+
+      Module *M = F->getParent();
+      SmallVector<Value *, 1> Elts;
+      Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+
+      // Convert the type of the pointer to a pointer to the stored type.
+      Value *BC = Builder.CreateBitCast(Arg0,
+                                        PointerType::getUnqual(Arg1->getType()),
+                                        "cast");
+      StoreInst *SI = Builder.CreateStore(Arg1, BC);
+      SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+      SI->setAlignment(32);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
+      return;
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
-      if (Name.startswith("llvm.x86.avx.vpermil.pd.256"))
+      if (Name == "llvm.x86.avx.vpermil.pd.256")
         PD256 = true;
-      else if (Name.startswith("llvm.x86.avx.vpermil.pd"))
+      else if (Name == "llvm.x86.avx.vpermil.pd")
         PD128 = true;
-      else if (Name.startswith("llvm.x86.avx.vpermil.ps.256"))
+      else if (Name == "llvm.x86.avx.vpermil.ps.256")
         PS256 = true;
-      else if (Name.startswith("llvm.x86.avx.vpermil.ps"))
+      else if (Name == "llvm.x86.avx.vpermil.ps")
         PS128 = true;
 
       if (PD256 || PD128 || PS256 || PS128) {
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index b33493252a5..4cca693d383 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2555,3 +2555,27 @@ define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
   ret i32 %tmp
 }
 declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+; CHECK: movntdq
+define void @movnt_dq(i8* %p, <4 x i64> %a1) nounwind {
+  %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
+  tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a2) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
+
+; CHECK: movntps
+define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
+  tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
+
+; CHECK: movntpd
+define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
+  ; add operation forces the execution domain.
+  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind