diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index e14b4b6d5ce..22c9d294356 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -445,7 +445,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">, Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - // FIXME: Temporary workaround since 2-wide shuffle is broken. def int_x86_sse2_movl_dq : GCCBuiltin<"__builtin_ia32_movqv4si">, Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">, @@ -463,6 +462,35 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_void_ty], [IntrWriteMem]>; } +// Shuffles. +// FIXME: Temporary workarounds since 2-wide shuffle is broken. Every intrinsic in this group is typed on 2-element vectors (llvm_v2f64_ty or llvm_v2i64_ty). +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_sse2_movs_d : GCCBuiltin<"__builtin_ia32_movsd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_loadh_pd : GCCBuiltin<"__builtin_ia32_loadhpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_ptr_ty], [IntrReadMem]>; + def int_x86_sse2_loadl_pd : GCCBuiltin<"__builtin_ia32_loadlpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_ptr_ty], [IntrReadMem]>; + def int_x86_sse2_shuf_pd : GCCBuiltin<"__builtin_ia32_shufpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_int_ty], [IntrNoMem]>; + def int_x86_sse2_unpckh_pd : GCCBuiltin<"__builtin_ia32_unpckhpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_unpckl_pd : GCCBuiltin<"__builtin_ia32_unpcklpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_punpckh_qdq : GCCBuiltin<"__builtin_ia32_punpckhqdq128">, + Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty], [IntrNoMem]>; + def 
int_x86_sse2_punpckl_qdq : GCCBuiltin<"__builtin_ia32_punpcklqdq128">, + Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty], [IntrNoMem]>; +} + //===----------------------------------------------------------------------===// // SSE3 diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 36b8428bab6..4d4e8dbe8da 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2212,11 +2212,6 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src), "movq {$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -// FIXME: Temporary workaround since 2-wide shuffle is broken. -def MOVLQ128rr : PDI<0xD6, MRMSrcReg, (ops VR128:$dst, VR128:$src), - "movq {$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>; - // Move to lower bits of a VR128 and zeroing upper bits. // Loading from memory automatically zeroing upper bits. let AddedComplexity = 20 in { @@ -2241,13 +2236,16 @@ def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src), [(set VR128:$dst, (v4i32 (vector_shuffle immAllZerosV, (v4i32 (scalar_to_vector (loadi32 addr:$src))), MOVL_shuffle_mask)))]>; -def MOVZQI2PQIrr : PDI<0x7E, MRMSrcMem, (ops VR128:$dst, VR64:$src), - "movq {$src, $dst|$dst, $src}", []>; -def MOVZQI2PQIrm : PDI<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), - "movq {$src, $dst|$dst, $src}", - [(set VR128:$dst, (bc_v2i64 (vector_shuffle immAllZerosV, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), - MOVL_shuffle_mask)))]>; +// Moving from XMM to XMM while still clearing the upper 64 bits. 
+def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>, + XS, Requires<[HasSSE2]>; +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq + (bc_v4i32 (loadv2i64 addr:$src))))]>, + XS, Requires<[HasSSE2]>; } //===----------------------------------------------------------------------===// @@ -2482,8 +2480,42 @@ def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, MOVL_shuffle_mask)), (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + +// Set lowest element and zero upper elements: a loaded f64 shuffled (MOVL_shuffle_mask) with an all-zeros vector is selected to MOVZQI2PQIrm. +def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), + MOVL_shuffle_mask)), + (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>; } +// FIXME: Temporary workaround since 2-wide shuffle is broken. Each int_x86_sse2_* 2-wide shuffle intrinsic is selected directly to an SSE2 instruction (register-operand and/or memory-operand patterns as listed); all require HasSSE2. +def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2), + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2), + (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2), + (UNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)), + (UNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2), + (UNPCKLPDrr VR128:$src1, 
VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)), + (UNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2), + (PUNPCKHQDQrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)), + (PUNPCKHQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2), + (PUNPCKLQDQrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)), + (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + // 128-bit logical shifts def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>,