From d5f323d70bd2d9bc8a63a68bfe439a69e0104bbf Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Tue, 12 Apr 2011 22:46:31 +0000 Subject: [PATCH] Remove the unaligned load intrinsics in favor of using native unaligned loads. Now that we have a first-class way to represent unaligned loads, the unaligned load intrinsics are superfluous. First part of . git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129401 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IntrinsicsX86.td | 14 --------- lib/Target/X86/X86InstrInfo.cpp | 5 ---- lib/Target/X86/X86InstrSSE.td | 26 ---------------- .../InstCombine/InstCombineCalls.cpp | 6 +--- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 3 -- lib/VMCore/AutoUpgrade.cpp | 30 ++++++++++++++++++- test/Assembler/AutoUpgradeIntrinsics.ll | 12 ++++++++ test/CodeGen/X86/avx-intrinsics-x86.ll | 4 +-- 8 files changed, 44 insertions(+), 56 deletions(-) diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index 49462200f09..61143eca932 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -138,12 +138,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_x86mmx_ty], [IntrNoMem]>; } -// SIMD load ops -let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse_loadu_ps : GCCBuiltin<"__builtin_ia32_loadups">, - Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadMem]>; -} - // SIMD store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">, @@ -452,14 +446,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>; } -// SIMD load ops -let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse2_loadu_pd : GCCBuiltin<"__builtin_ia32_loadupd">, - Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadMem]>; - def int_x86_sse2_loadu_dq : GCCBuiltin<"__builtin_ia32_loaddqu">, - Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem]>; -} - // SIMD store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">, diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 21df57c9cda..85ab916d71d 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MMX_MOVD64rm: @@ -2845,11 +2844,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } switch (Opc2) { @@ -2869,11 +2866,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8f08e68a262..c36e975e3cf 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -329,15 +329,6 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), // Intrinsic forms of MOVUPS/D load and store let isAsmParserOnly = 0 in { - let canFoldAsLoad = 1, isReMaterializable = 1 in - def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; - def VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -347,13 +338,6 @@ let isAsmParserOnly = 0 in { "movupd\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; } -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -2229,22 +2213,12 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), // Intrinsic forms of MOVDQU load and store let isAsmParserOnly = 0 in { -let canFoldAsLoad = 1 in -def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, VEX, Requires<[HasAVX]>; } -let canFoldAsLoad = 1 in -def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, Requires<[HasSSE2]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 875e9cae582..726105f75d6 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -537,11 +537,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: - case Intrinsic::x86_sse_loadu_ps: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse2_loadu_dq: - // Turn PPC lvx -> load if the pointer is known aligned. - // Turn X86 loadups -> load if the pointer is known aligned. + // Turn PPC lvx -> load if the pointer is known aligned. if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 87e78fab829..f6e2c88bc6a 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -572,9 +572,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::prefetch: - case Intrinsic::x86_sse2_loadu_dq: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse_loadu_ps: case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 9e551bb0604..4541f381ed4 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -527,6 +527,12 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // or 0. NewFn = 0; return true; + } else if (Name.compare(5, 16, "x86.sse.loadu.ps", 16) == 0 || + Name.compare(5, 17, "x86.sse2.loadu.dq", 17) == 0 || + Name.compare(5, 17, "x86.sse2.loadu.pd", 17) == 0) { + // Calls to these instructions are transformed into unaligned loads. + NewFn = 0; + return true; } else if (Name.compare(5, 17, "x86.ssse3.pshuf.w", 17) == 0) { // This is an SSE/MMX instruction. const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext()); @@ -946,7 +952,29 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { // Remove upgraded instruction. CI->eraseFromParent(); - + + } else if (F->getName() == "llvm.x86.sse.loadu.ps" || + F->getName() == "llvm.x86.sse2.loadu.dq" || + F->getName() == "llvm.x86.sse2.loadu.pd") { + // Convert to a native, unaligned load. + const Type *VecTy = CI->getType(); + const Type *IntTy = IntegerType::get(C, 128); + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + + Value *BC = Builder.CreateBitCast(CI->getArgOperand(0), + PointerType::getUnqual(IntTy), + "cast"); + LoadInst *LI = Builder.CreateLoad(BC, CI->getName()); + LI->setAlignment(1); // Unaligned load. + BC = Builder.CreateBitCast(LI, VecTy, "new.cast"); + + // Fix up all the uses with our new load. + if (!CI->use_empty()) + CI->replaceAllUsesWith(BC); + + // Remove intrinsic. + CI->eraseFromParent(); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll index 6752bd8281b..e4e2d3a56e0 100644 --- a/test/Assembler/AutoUpgradeIntrinsics.ll +++ b/test/Assembler/AutoUpgradeIntrinsics.ll @@ -7,6 +7,8 @@ ; RUN: llvm-as < %s | llvm-dis | \ ; RUN: not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*} ; RUN: llvm-as < %s | llvm-dis | \ +; RUN: not grep {llvm\\.x86\\.sse2\\.loadu} +; RUN: llvm-as < %s | llvm-dis | \ ; RUN: grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16 declare i32 @llvm.ctpop.i28(i28 %val) @@ -79,3 +81,13 @@ define void @sh64(<1 x i64> %A, <2 x i32> %B) { %r2 = call <1 x i64> @llvm.x86.mmx.psrl.q( <1 x i64> %A, <2 x i32> %B ) ; <<1 x i64>> [#uses=0] ret void } + +declare <4 x float> @llvm.x86.sse.loadu.ps(i8*) nounwind readnone +declare <16 x i8> @llvm.x86.sse2.loadu.dq(i8*) nounwind readnone +declare <2 x double> @llvm.x86.sse2.loadu.pd(double*) nounwind readnone +define void @test_loadu(i8* %a, double* %b) { + %v0 = call <4 x float> @llvm.x86.sse.loadu.ps(i8* %a) + %v1 = call <16 x i8> @llvm.x86.sse2.loadu.dq(i8* %a) + %v2 = call <2 x double> @llvm.x86.sse2.loadu.pd(double* %b) + ret void +} diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 6c32396a417..5201688686d 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -247,7 +247,7 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind define <16 x i8> @test_x86_sse2_loadu_dq(i8* %a0) { ; CHECK: movl - ; CHECK: vmovdqu + ; CHECK: vmovups %res = call <16 x i8> @llvm.x86.sse2.loadu.dq(i8* %a0) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -256,7 +256,7 @@ declare <16 x i8> @llvm.x86.sse2.loadu.dq(i8*) nounwind readonly define <2 x double> @test_x86_sse2_loadu_pd(i8* %a0) { ; CHECK: movl - ; CHECK: vmovupd + ; CHECK: vmovups %res = call <2 x double> @llvm.x86.sse2.loadu.pd(i8* %a0) ; <<2 x double>> [#uses=1] ret <2 x double> %res }