diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index ebb9b8e5d86..5cfd6645c65 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -531,6 +531,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
 
   // AArch64 doesn't have MUL.2d:
   setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+  // Custom handling for some quad-vector types to detect MULL.
+  setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+  setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+  setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
   setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
   // Likewise, narrowing and extending vector loads/stores aren't handled
@@ -853,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
   case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
   }
 }
 
@@ -1668,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
                      0);
 }
 
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+    return MVT::v2i32;
+  case MVT::v4i8:
+    return MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc = AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc = AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this. It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
 
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
@@ -1768,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFP_TO_INT(Op, DAG);
   case ISD::FSINCOS:
     return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 3e960ca75f5..2f5708dd897 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -169,6 +169,9 @@ enum {
   /// mode without emitting such REV instructions.
   NVCAST,
 
+  SMULL,
+  UMULL,
+
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LD3post,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index c2e2e8b22de..831defa3987 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -239,6 +239,11 @@ def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
 
 def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
 
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                           SDTCisSameAs<1, 2>]>;
+def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
@@ -3148,6 +3153,46 @@ defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
                  BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
 defm USUBW   : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
                  BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
 
+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+                                   Instruction INST8B,
+                                   Instruction INST4H,
+                                   Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+                               SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+                               UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+                                      Instruction INST8B,
+                                      Instruction INST4H,
+                                      Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+         TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+         SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+         TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+         UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+         TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+         SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+         TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+         UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
 // Patterns for 64-bit pmull
 def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
           (PMULLv1i64 V64:$Rn, V64:$Rm)>;
diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
new file mode 100644
index 00000000000..92582d7d25e
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@@ -0,0 +1,332 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s
+
+define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: smull_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: smull_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: smull_v2i32_v2i64:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: umull_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: umull_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: umull_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlal_v8i8_v8i16:
+; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlal_v4i16_v4i32:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlal_v2i32_v2i64:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlal_v8i8_v8i16:
+; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlal_v4i16_v4i32:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlal_v2i32_v2i64:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlsl_v8i8_v8i16:
+; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlsl_v4i16_v4i32:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlsl_v2i32_v2i64:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlsl_v8i8_v8i16:
+; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlsl_v4i16_v4i32:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlsl_v2i32_v2i64:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
+define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = sext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = sext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 -12, i64 -12>
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use UMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = zext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = zext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
+  ret <2 x i64> %tmp4
+}
+
+define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+; If one operand has a zero-extend and the other a sign-extend, smull
+; cannot be used.
+; CHECK-LABEL: smullWithInconsistentExtensions:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %1 = sext <8 x i8> %vec to <8 x i16>
+  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = extractelement <8 x i16> %2, i32 0
+  ret i16 %3
+}
+
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK-LABEL: distribute:
+; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
+; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %8 = zext <8 x i8> %2 to <8 x i16>
+  %9 = extractelement <2 x double> %4, i32 0
+  %10 = bitcast double %9 to <8 x i8>
+  %11 = zext <8 x i8> %10 to <8 x i16>
+  %12 = add <8 x i16> %7, %11
+  %13 = mul <8 x i16> %12, %8
+  %14 = bitcast i16* %dst to i8*
+  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
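
For context only, not part of the patch: the source-level pattern this lowering targets is a widening multiply(-accumulate) on narrow elements, i.e. the sext/zext `mul` (plus `add`/`sub`) IR exercised by the tests above. Below is a minimal C sketch of such code; the function name and the assumption that the loop vectorizer turns it into `<8 x i16>` operations on zero-extended `<8 x i8>` operands are illustrative, not taken from this patch.

    #include <stddef.h>
    #include <stdint.h>

    /* Each product is u8 * u8 widened to u16 and accumulated: after
       vectorization this is the (zext A) * (zext B) multiply that LowerMUL
       maps to AArch64ISD::UMULL, and the accumulate on top of it is what the
       new UMLAL patterns in AArch64InstrInfo.td match. */
    void widening_mac(uint16_t *dst, const uint8_t *a, const uint8_t *b,
                      size_t n) {
      for (size_t i = 0; i < n; ++i)
        dst[i] += (uint16_t)a[i] * (uint16_t)b[i];
    }

Built with a vectorizing compiler for AArch64 (for example clang -O3 --target=aarch64-none-linux-gnu; the exact invocation is illustrative), the vector loop body is expected to select umull/umlal rather than a full-width mul, mirroring the umlal_v8i8_v8i16 test above.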