diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index b5230dcfe15..b6022246b52 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -460,12 +460,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_psrl_dq : Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_sse2_psll_dq_bs : GCCBuiltin<"__builtin_ia32_pslldqi128_byteshift">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, - llvm_i32_ty], [IntrNoMem]>; - def int_x86_sse2_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi128_byteshift">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, - llvm_i32_ty], [IntrNoMem]>; } // Conversion ops diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 7f70fce4484..6d53d11988f 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -163,6 +163,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "x86.avx.vbroadcast.ss" || Name == "x86.avx.vbroadcast.ss.256" || Name == "x86.avx.vbroadcast.sd.256" || + Name == "x86.sse2.psll.dq.bs" || + Name == "x86.sse2.psrl.dq.bs" || (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = nullptr; return true; @@ -487,6 +489,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); + } else if (Name == "llvm.x86.sse2.psll.dq.bs") { + Value *Op0 = ConstantVector::getSplat(16, Builder.getInt8(0)); + Value *Op1 = Builder.CreateBitCast(CI->getArgOperand(0), + VectorType::get(Type::getInt8Ty(C),16), + "cast"); + + unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + + if (Shift < 16) { + SmallVector Idxs; + for (unsigned i = 16; i != 32; ++i) + Idxs.push_back(Builder.getInt32(i - Shift)); + + Op0 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs)); + } + + Rep = Builder.CreateBitCast(Op0, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); + } else if (Name == "llvm.x86.sse2.psrl.dq.bs") { + Value *Op0 = Builder.CreateBitCast(CI->getArgOperand(0), + VectorType::get(Type::getInt8Ty(C),16), + "cast"); + Value *Op1 = ConstantVector::getSplat(16, Builder.getInt8(0)); + + unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + + if (Shift < 16) { + SmallVector Idxs; + for (unsigned i = 0; i != 16; ++i) + Idxs.push_back(Builder.getInt32(i + Shift)); + + Op1 = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs)); + } + Rep = Builder.CreateBitCast(Op1, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); } else { bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; if (Name == "llvm.x86.avx.vpermil.pd.256") diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 3e9eafbc12d..0d6a9f0f692 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4367,20 +4367,16 @@ defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>, - VEX_4V; + []>, VEX_4V; def VPSRLDQri : PDIi8<0x73, MRM3r, (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>, - VEX_4V; + []>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } } // Predicates = [HasAVX] @@ -4459,20 +4455,16 @@ defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 128-bit logical shifts. def PSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + [], IIC_SSE_INTSHDQ_P_RI>; def PSRLDQri : PDIi8<0x73, MRM3r, (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + [], IIC_SSE_INTSHDQ_P_RI>; // PSRADQri doesn't exist in SSE[1-3]. } } // Constraints = "$src1 = $dst" diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index bb9354cff03..2696f3b9652 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -465,14 +465,6 @@ define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone - - define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: vpsllq %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -561,14 +553,6 @@ define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone - - define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: vpsrlq %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index ddb04211ec7..99357e73766 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -418,14 +418,6 @@ define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone - - define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: psllq %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -514,14 +506,6 @@ define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone - - define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: psrlq %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]