From 8263dcdf23bc534405745959c97cbfd562362458 Mon Sep 17 00:00:00 2001 From: Kevin Qin Date: Tue, 5 Nov 2013 02:03:59 +0000 Subject: [PATCH] Implemented aarch64 neon intrinsic vcopy_lane with float type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194041 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +- lib/Target/AArch64/AArch64InstrNEON.td | 161 ++++++++++----------- test/CodeGen/AArch64/neon-copy.ll | 112 ++++++++++++++ 3 files changed, 194 insertions(+), 88 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 323acdd01d2..4fa7deb97ef 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3756,9 +3756,12 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Any value type smaller than i32 is illegal in AArch64, and this lower // function is called after legalize pass, so we need to legalize // the result here. - EVT EltVT = MVT::i32; - if(EltSize == 64) - EltVT = MVT::i64; + EVT EltVT; + if (VT.getVectorElementType().isFloatingPoint()) + EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; + else + EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; + PassN = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, DAG.getConstant(Mask, MVT::i64)); PassN = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, PassN, diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index 4d8adb898ff..4ecc0dc4b35 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -5045,19 +5045,12 @@ def INSsw_pattern : Neon_INS_main_pattern; -class NeonI_INS_element +class NeonI_INS_element : NeonI_insert<0b1, 0b1, (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, ResImm:$Immd, ResImm:$Immn), asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (ResImm:$Immn))), - (ResImm:$Immd))))], + [], NoItinerary> { let Constraints = "$src = $Rd"; bits<4> Immd; @@ -5065,39 +5058,92 @@ class NeonI_INS_element { +def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; } -def INSELh : NeonI_INS_element<"ins", "h", v8i16, neon_uimm3_bare, i32> { +def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; let Inst{14-12} = {Immn{2}, Immn{1}, Immn{0}}; // bit 11 is unspecified. } -def INSELs : NeonI_INS_element<"ins", "s", v4i32, neon_uimm2_bare, i32> { +def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; let Inst{14-13} = {Immn{1}, Immn{0}}; // bits 11-12 are unspecified. } -def INSELd : NeonI_INS_element<"ins", "d", v2i64, neon_uimm1_bare, i64> { +def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; let Inst{14} = Immn{0}; // bits 11-13 are unspecified. } +multiclass Neon_INS_elt_pattern { +def : Pat<(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), + StImm:$Immd, StImm:$Immn)>; + +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + StImm:$Immd, NaImm:$Immn)>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy VPR128:$Rn), + NaImm:$Immd, StImm:$Immn)), + sub_64))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + NaImm:$Immd, NaImm:$Immn)), + sub_64))>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + multiclass Neon_INS_elt_float_pattern { -def : Pat<(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (ResImm:$Immn))), - (ResImm:$Immd))), - (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), - ResImm:$Immd, ResImm:$Immn)>; - def : Pat <(ResTy (vector_insert (ResTy VPR128:$src), (MidTy OpFPR:$Rn), @@ -5125,60 +5171,6 @@ defm : Neon_INS_elt_float_pattern; -multiclass Neon_INS_elt_pattern { -def : Pat<(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (StTy VPR128:$Rn), - (StImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (StTy (INS - (StTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - (StTy VPR128:$Rn), - NaImm:$Immd, - StImm:$Immn)), - sub_64))>; - -def : Pat<(StTy (vector_insert - (StTy VPR128:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (StImm:$Immd))), - (StTy (INS - (StTy VPR128:$src), - (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - StImm:$Immd, - NaImm:$Immn))>; - -def : Pat<(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (StTy (INS - (StTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Immd, - NaImm:$Immn)), - sub_64))>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - - class NeonI_SMOV @@ -5408,8 +5400,7 @@ def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), (f64 FPR64:$src), sub_64)>; class NeonI_DUP_Elt + RegisterOperand ResVPR, Operand OpImm> : NeonI_copy Imm; } -def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, v16i8, v16i8, +def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, neon_uimm4_bare> { let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; } -def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, v8i16, v8i16, +def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, neon_uimm3_bare> { let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; } -def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, v4i32, v4i32, +def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, neon_uimm2_bare> { let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; } -def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, v2i64, v2i64, +def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, neon_uimm1_bare> { let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; } -def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, v8i8, v16i8, +def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, neon_uimm4_bare> { let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; } -def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, v4i16, v8i16, +def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, neon_uimm3_bare> { let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; } -def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, v2i32, v4i32, +def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, neon_uimm2_bare> { let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; } diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll index 943072e285b..e1afc245645 100644 --- a/test/CodeGen/AArch64/neon-copy.ll +++ b/test/CodeGen/AArch64/neon-copy.ll @@ -71,6 +71,104 @@ define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { ret <2 x i64> %tmp4 } +define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x float> %tmp1, i32 2 + %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 + ret <4 x float> %tmp4 +} + +define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x double> %tmp1, i32 0 + %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 + ret <2 x double> %tmp4 +} + +define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <8 x i8> %tmp1, i32 2 + %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x i32> %tmp1, i32 1 + %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x i64> %tmp1, i32 0 + %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 + ret <2 x i64> %tmp4 +} + +define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x float> %tmp1, i32 1 + %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 + ret <4 x float> %tmp4 +} + +define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x double> %tmp1, i32 0 + %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 + ret <2 x double> %tmp4 +} + +define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <16 x i8> %tmp1, i32 2 + %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x i32> %tmp1, i32 2 + %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x i64> %tmp1, i32 0 + %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 + ret <1 x i64> %tmp4 +} + +define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x float> %tmp1, i32 2 + %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 + ret <2 x float> %tmp4 +} + +define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x double> %tmp1, i32 0 + %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 + ret <1 x double> %tmp4 +} + define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { ;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2] %tmp3 = extractelement <8 x i8> %tmp1, i32 2 @@ -99,6 +197,20 @@ define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { ret <1 x i64> %tmp4 } +define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0] + %tmp3 = extractelement <2 x float> %tmp1, i32 0 + %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 + ret <2 x float> %tmp4 +} + +define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x double> %tmp1, i32 0 + %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 + ret <1 x double> %tmp4 +} + define i32 @umovw16b(<16 x i8> %tmp1) { ;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8