From 7f1f8d4146942e4cd1efd3f730dbada026dfdf57 Mon Sep 17 00:00:00 2001 From: Jiangning Liu Date: Tue, 3 Dec 2013 01:29:32 +0000 Subject: [PATCH] Add some missing pattern matches for AArch64 Neon intrinsics like vmull_high_n_s16 and friends. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196190 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- lib/Target/AArch64/AArch64InstrNEON.td | 1417 +++++++++++--------- test/CodeGen/AArch64/neon-2velem-high.ll | 331 +++++ 3 files changed, 1084 insertions(+), 666 deletions(-) create mode 100644 test/CodeGen/AArch64/neon-2velem-high.ll diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 6ea4b483eb4..5d20a96ff02 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4004,8 +4004,8 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - // Use VDUP for non-constant splats. if (hasDominantValue && EltSize <= 64) { + // Use VDUP for non-constant splats. if (!isConstant) { SDValue N; diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index 581ebae2481..8306fd5d966 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -6097,669 +6097,6 @@ defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; -// The followings are for instruction class (3V Elem) - -// Variant 1 - -class NI_2VE size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem { - bits<3> Index; - bits<5> Re; - - let Constraints = "$src = $Rd"; -} - -multiclass NI_2VE_v1 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; -defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; - -// Pattern for lane in 128-bit vector -class NI_2VE_laneq - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_lane - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_v1_pat -{ - def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; -defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; - -class NI_2VE_2op size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem { - bits<3> Index; - bits<5> Re; -} - -multiclass NI_2VE_v1_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; -defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; -defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; - -// Pattern for lane in 128-bit vector -class NI_2VE_mul_laneq - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_mul_lane - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_mul_v1_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; -defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; -defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; - -// Variant 2 - -multiclass NI_2VE_v2_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! - - def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; -defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; - -class NI_2VE_mul_lane_2d - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; - -multiclass NI_2VE_mul_v2_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2f32, v2f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4f32, v4f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR128, v2f64, v2f64, v2f64>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2f32, v2f32, v2f32>; - - def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR64, v2f64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; -defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; - -// The followings are patterns using fma -// -ffp-contract=fast generates fma - -multiclass NI_2VE_v2 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! - - def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; -defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; - -// Pattern for lane in 128-bit vector -class NI_2VEswap_laneq - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), - (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane_2d2d - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; - - -multiclass NI_2VE_fma_v2_pat { - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; - -multiclass NI_2VE_fms_v2_pat -{ - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(fneg (Neon_combine_2d - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d - (fneg node:$LHS), (fneg node:$RHS))>>; -} - -defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; - -// Variant 3: Long type -// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S -// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S - -multiclass NI_2VE_v3 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; -defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; -defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; -defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; -defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; -defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; - -multiclass NI_2VE_v3_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; -defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; -defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_laneq - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_lane - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VEL_v3_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; -defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; -defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; -defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_mul_laneq - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_mul_lane - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VEL_mul_v3_pat { - def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; -defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; -defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; - -multiclass NI_qdma { - def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; - - def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; -} - -defm Neon_qdmlal : NI_qdma; -defm Neon_qdmlsl : NI_qdma; - -multiclass NI_2VEL_v3_qdma_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, - v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR128, - v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128Lo, - v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, - v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, - v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR64, - v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR64Lo, - v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR64, - v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; -defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; - -// End of implementation for instruction class (3V Elem) - class NeonI_INS_main : NeonI_copy<0b1, 0b0, 0b0011, @@ -7182,10 +6519,13 @@ def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), (FMOVdx $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))), + (v1f32 FPR32:$Rn)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), + (v1f64 FPR64:$Rn)>; + def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), (FMOVdd $src)>; -def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), - (FMOVss $src)>; def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), @@ -7363,6 +6703,753 @@ def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +// The followings are for instruction class (3V Elem) + +// Variant 1 + +class NI_2VE size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; + + let Constraints = "$src = $Rd"; +} + +multiclass NI_2VE_v1 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; +defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; + +// Pattern for lane in 128-bit vector +class NI_2VE_laneq + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_lane + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_v1_pat +{ + def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; +defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; + +class NI_2VE_2op size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; +} + +multiclass NI_2VE_v1_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; +defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; +defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; + +// Pattern for lane in 128-bit vector +class NI_2VE_mul_laneq + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_mul_lane + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_mul_v1_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; +defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; +defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; + +// Variant 2 + +multiclass NI_2VE_v2_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! + + def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; +defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; + +class NI_2VE_mul_lane_2d + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; + +multiclass NI_2VE_mul_v2_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2f32, v2f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4f32, v4f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR128, v2f64, v2f64, v2f64>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2f32, v2f32, v2f32>; + + def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR64, v2f64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; +defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; + +def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), + (v2f32 VPR64:$Rn))), + (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), + (v4f32 VPR128:$Rn))), + (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), + (v2f64 VPR128:$Rn))), + (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; + +// The followings are patterns using fma +// -ffp-contract=fast generates fma + +multiclass NI_2VE_v2 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! + + def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; +defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; + +// Pattern for lane in 128-bit vector +class NI_2VEswap_laneq + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), + (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane 0 +class NI_2VEfma_lane0 + : Pat<(ResTy (op (ResTy ResVPR:$Rn), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane_2d2d + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; + + +multiclass NI_2VE_fma_v2_pat { + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; + +// Pattern for lane 0 +class NI_2VEfms_lane0 + : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +multiclass NI_2VE_fms_v2_pat +{ + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(fneg (Neon_combine_2d + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d + (fneg node:$LHS), (fneg node:$RHS))>>; +} + +defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; + +// Variant 3: Long type +// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S +// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S + +multiclass NI_2VE_v3 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; +defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; +defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; +defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; +defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; +defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; + +multiclass NI_2VE_v3_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; +defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; +defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), + (FMOVss $src)>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_laneq + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_lane + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +class NI_2VEL2_lane0 + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_v3_pat { + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; +defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; +defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; +defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_mul_laneq + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_mul_lane + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for fixed lane 0 +class NI_2VEL2_mul_lane0 + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_mul_v3_pat { + def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_mul_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_mul_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; +defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; +defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; + +multiclass NI_qdma { + def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; + + def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; +} + +defm Neon_qdmlal : NI_qdma; +defm Neon_qdmlsl : NI_qdma; + +multiclass NI_2VEL_v3_qdma_pat { + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, + v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR128, + v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128Lo, + v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, + v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + !cast(op # "_4s"), + v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + !cast(op # "_2d"), + v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, + v4i32, v4i16, v4i16>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR64, + v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR64Lo, + v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR64, + v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; +defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; + +// End of implementation for instruction class (3V Elem) + class NeonI_REV size, bit Q, bit U, bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, SDPatternOperator Neon_Rev> diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll new file mode 100644 index 00000000000..97031d98b7c --- /dev/null +++ b/test/CodeGen/AArch64/neon-2velem-high.ll @@ -0,0 +1,331 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_s16: +; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) { +; CHECK: test_vmull_high_n_s32: +; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_u16: +; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) { +; CHECK: test_vmull_high_n_u32: +; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vqdmull_high_n_s16: +; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vqdmull15.i.i +} + +define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) { +; CHECK: test_vqdmull_high_n_s32: +; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vqdmull9.i.i +} + +define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlal_high_n_s16: +; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlal_high_n_s32: +; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlal_high_n_u16: +; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlal_high_n_u32: +; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vqdmlal_high_n_s16: +; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i) + ret <4 x i32> %vqdmlal17.i.i +} + +define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vqdmlal_high_n_s32: +; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i) + ret <2 x i64> %vqdmlal11.i.i +} + +define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlsl_high_n_s16: +; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlsl_high_n_s32: +; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlsl_high_n_u16: +; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlsl_high_n_u32: +; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vqdmlsl_high_n_s16: +; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i) + ret <4 x i32> %vqdmlsl17.i.i +} + +define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vqdmlsl_high_n_s32: +; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i) + ret <2 x i64> %vqdmlsl11.i.i +} + +define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) { +; CHECK: test_vmul_n_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %vecinit.i = insertelement <2 x float> undef, float %b, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1 + %mul.i = fmul <2 x float> %vecinit1.i, %a + ret <2 x float> %mul.i +} + +define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) { +; CHECK: test_vmulq_n_f32: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %vecinit.i = insertelement <4 x float> undef, float %b, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3 + %mul.i = fmul <4 x float> %vecinit3.i, %a + ret <4 x float> %mul.i +} + +define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) { +; CHECK: test_vmulq_n_f64: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %vecinit.i = insertelement <2 x double> undef, double %b, i32 0 + %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1 + %mul.i = fmul <2 x double> %vecinit1.i, %a + ret <2 x double> %mul.i +} + +define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) { +; CHECK: test_vfma_n_f32: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { +; CHECK: test_vfmaq_n_f32: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) { +; CHECK: test_vfms_n_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 + %0 = fsub <2 x float> , %b + %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a) + ret <2 x float> %1 +} + +define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { +; CHECK: test_vfmsq_n_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 + %0 = fsub <4 x float> , %b + %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a) + ret <4 x float> %1 +}