diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 7a368bfc35f..fd8ac0b328e 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -5634,6 +5634,7 @@ multiclass Lengthen_HalfSingle = // Pat<(v4i32 (extloadvi8 addrmode5:$addr)) // (EXTRACT_SUBREG (VMOVLuv4i32 @@ -5644,28 +5645,63 @@ multiclass Lengthen_HalfSingle; multiclass Lengthen_Double { + string Insn2Ty> { + def _Any : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("extloadv" # SrcTy) addrmode5:$addr)), + (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), + ssub_0)), dsub_0))>; + def _Z : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), + ssub_0)), dsub_0))>; + def _S : Pat<(!cast("v" # DestLanes # DestTy) + (!cast("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), + ssub_0)), dsub_0))>; +} + +// extload, zextload and sextload for a lengthening load followed by another +// lengthening load, to quadruple the initial length, but which ends up only +// requiring half the available lanes (a 64-bit outcome instead of a 128-bit). +// +// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> = +// Pat<(v4i32 (extloadvi8 addrmode5:$addr)) +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), +// (VLDRS addrmode5:$addr), +// ssub_0)), +// dsub_0)), +// dsub_0)>; +multiclass Lengthen_HalfDouble { def _Any : Pat<(!cast("v" # DestLanes # DestTy) (!cast("extloadv" # SrcTy) addrmode5:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), dsub_0)), - RegType)>; + dsub_0)>; def _Z : Pat<(!cast("v" # DestLanes # DestTy) (!cast("zextloadv" # SrcTy) addrmode5:$addr)), (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLuv" # Insn1Lanes # Insn1Ty) (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), dsub_0)), - RegType)>; + dsub_0)>; def _S : Pat<(!cast("v" # DestLanes # DestTy) (!cast("sextloadv" # SrcTy) addrmode5:$addr)), (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast("VMOVLsv" # Insn1Lanes # Insn1Ty) (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), dsub_0)), - RegType)>; + dsub_0)>; } defm : Lengthen_Single<"8", "i16", "i8">; // v8i8 -> v8i16 @@ -5676,12 +5712,12 @@ defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 defm : Lengthen_HalfSingle<"2", "i16", "i8", "8", "i16">; // v2i8 -> v2i16 defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 -// Double lengthening - v4i8 -> v4i16 -> v4i32 -defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0>; +// Double lengthening - v4i8 -> v4i16 -> v4i32 +defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; // v2i8 -> v2i16 -> v2i32 -defm : Lengthen_Double<"2", "i32", "i8", "8", "i16", "4", "i32", dsub_0>; +defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; // v2i16 -> v2i32 -> v2i64 -defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64", qsub_0>; +defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 def : Pat<(v2i64 (extloadvi8 addrmode5:$addr)), diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll index 5e9239f2563..1ec36da38f7 100644 --- a/test/CodeGen/ARM/vector-extend-narrow.ll +++ b/test/CodeGen/ARM/vector-extend-narrow.ll @@ -44,3 +44,17 @@ define <4 x i8> @h(<4 x float> %v) { %1 = fptoui <4 x float> %v to <4 x i8> ret <4 x i8> %1 } + +; CHECK: i: +define <4 x i8> @i(<4 x i8>* %x) { + ; CHECK: vldr + ; CHECK: vmovl.s8 + ; CHECK: vmovl.s16 + ; CHECK: vrecpe + ; CHECK: vrecps + ; CHECK: vmul + ; CHECK: vmovn + %1 = load <4 x i8>* %x, align 4 + %2 = sdiv <4 x i8> zeroinitializer, %1 + ret <4 x i8> %2 +}