From ca57f78332eb0faae549b233662e38fbf8e09a0f Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Wed, 24 Sep 2008 23:27:55 +0000 Subject: [PATCH] Fix patterns for SSE4.1 move and sign extend instructions. Also add instructions which fold VZEXT_MOVL and VZEXT_LOAD. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@56594 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.td | 10 ++++ lib/Target/X86/X86InstrSSE.td | 82 ++++++++++++++++++++++++++++++--- test/CodeGen/X86/sse41-pmovx.ll | 47 +++++++++++++++++++ 3 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 test/CodeGen/X86/sse41-pmovx.ll diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1b68dda4484..bd61c3aed46 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -243,6 +243,16 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (ld node:$ptr)), [{ return false; }]>; +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{ + LoadSDNode *LD = cast(N); + if (LD->getAddressingMode() != ISD::UNINDEXED) + return false; + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + def loadi32 : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{ LoadSDNode *LD = cast(N); if (LD->getAddressingMode() != ISD::UNINDEXED) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 3e7262ab5c3..e89e8a3b56d 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -162,6 +162,17 @@ def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); }]>; @@ -3368,8 +3379,9 @@ multiclass SS41I_binop_rm_int8 opc, string OpcodeStr, Intrinsic IntId> { def rm : SS48I, OpSize; + [(set VR128:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; } defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; @@ -3379,6 +3391,38 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; +// Common patterns involving scalar load. +def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>; + + multiclass SS41I_binop_rm_int4 opc, string OpcodeStr, Intrinsic IntId> { def rr : SS48I opc, string OpcodeStr, Intrinsic IntId> { def rm : SS48I, OpSize; + [(set VR128:$dst, + (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; } defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; @@ -3395,20 +3440,45 @@ defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; +// Common patterns involving scalar load +def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (PMOVSXBDrm addr:$src)>; +def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (PMOVSXWQrm addr:$src)>; + +def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (PMOVZXBDrm addr:$src)>; +def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (PMOVZXWQrm addr:$src)>; + + multiclass SS41I_binop_rm_int2 opc, string OpcodeStr, Intrinsic IntId> { def rr : SS48I, OpSize; + // Expecting a i16 load any extended to i32 value. def rm : SS48I, OpSize; + [(set VR128:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, + OpSize; } defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>; +// Common patterns involving scalar load +def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + +def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBQrm addr:$src)>; + /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8 opc, string OpcodeStr> { diff --git a/test/CodeGen/X86/sse41-pmovx.ll b/test/CodeGen/X86/sse41-pmovx.ll new file mode 100644 index 00000000000..2db23c19f6a --- /dev/null +++ b/test/CodeGen/X86/sse41-pmovx.ll @@ -0,0 +1,47 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movq +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxwd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbq +; RUN: llvm-as < %s | llc -march=x86-64 -mattr=sse41 | grep movq | count 1 + +define <2 x i64> @t1(i32* %p) nounwind { +entry: + %0 = load i32* %p, align 4 ; [#uses=1] + %1 = insertelement <4 x i32> undef, i32 %0, i32 0 ; <<4 x i32>> [#uses=1] + %2 = insertelement <4 x i32> %1, i32 0, i32 1 ; <<4 x i32>> [#uses=1] + %3 = insertelement <4 x i32> %2, i32 0, i32 2 ; <<4 x i32>> [#uses=1] + %4 = insertelement <4 x i32> %3, i32 0, i32 3 ; <<4 x i32>> [#uses=1] + %5 = bitcast <4 x i32> %4 to <16 x i8> ; <<16 x i8>> [#uses=1] + %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone ; <<4 x i32>> [#uses=1] + %7 = bitcast <4 x i32> %6 to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %7 +} + +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone + +define <2 x i64> @t2(i64* %p) nounwind readonly { +entry: + %0 = load i64* %p ; [#uses=1] + %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1] + %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1] + %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1] + %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %3 +} + +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone + + +@gv = external global i16 ; [#uses=1] + +define <2 x i64> @t3() nounwind { +entry: + %0 = load i16* @gv, align 2 ; [#uses=1] + %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1] + %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1] + %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1] + ret <2 x i64> %3 +} + +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone