From 36c7806f4eacd676932ba630246f88e0e37b1cd4 Mon Sep 17 00:00:00 2001
From: Hao Liu
Date: Tue, 19 Nov 2013 02:17:05 +0000
Subject: [PATCH] Implement AArch64 neon instructions class SIMD lsone and
 SIMD lsone-post.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195078 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp |  589 ++++-
 lib/Target/AArch64/AArch64ISelLowering.cpp |  217 +-
 lib/Target/AArch64/AArch64ISelLowering.h   |   24 +-
 lib/Target/AArch64/AArch64InstrFormats.td  |   79 +
 lib/Target/AArch64/AArch64InstrNEON.td     |  654 ++++-
 .../AArch64/AsmParser/AArch64AsmParser.cpp |   20 +-
 .../Disassembler/AArch64Disassembler.cpp   |  429 +++-
 .../InstPrinter/AArch64InstPrinter.cpp     |    2 +-
 lib/Target/AArch64/Utils/AArch64BaseInfo.h |   17 +-
 test/CodeGen/AArch64/neon-simd-ldst-one.ll | 2113 +++++++++++++++++
 .../AArch64/neon-simd-post-ldst-one.ll     |  319 +++
 test/MC/AArch64/neon-diagnostics.s         |  119 +
 test/MC/AArch64/neon-simd-ldst-one-elem.s  |  325 +++
 .../AArch64/neon-instructions.txt          |   84 +
 14 files changed, 4800 insertions(+), 191 deletions(-)
 create mode 100644 test/CodeGen/AArch64/neon-simd-ldst-one.ll
 create mode 100644 test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
 create mode 100644 test/MC/AArch64/neon-simd-ldst-one-elem.s

diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index a6ebfe31b4c..ef99541c170 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -117,11 +117,11 @@ private:
   SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);

   /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVLD(SDNode *N, unsigned NumVecs, bool isUpdating,
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
                     const uint16_t *Opcode);

   /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVST(SDNode *N, unsigned NumVecs, bool isUpdating,
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
                     const uint16_t *Opcodes);

   /// Form sequences of consecutive 64/128-bit registers for use in NEON
@@ -135,6 +135,19 @@ private:
   /// functions. Those should almost always be called instead.
   SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
                       unsigned SubRegs[]);
+
+  /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4.
+  /// The opcode array specifies the instructions used for load.
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       const uint16_t *Opcodes);
+
+  /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4.
+  /// The opcode arrays specify the instructions used for load/store.
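+  /// Lane instructions are only defined on the 128-bit register forms;
+  /// 64-bit vector operands are widened to 128 bits first (see the comment
+  /// above SelectVLDSTLane).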
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, + unsigned NumVecs, const uint16_t *Opcodes); + + SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand); }; } @@ -590,32 +603,84 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register; case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register; case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register; + + // Post-index of duplicate loads + case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register; + case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register; + case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register; + case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register; + case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register; + case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register; + case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register; + case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register; + + case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register; + case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register; + case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register; + case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register; + case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register; + case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register; + case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register; + case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register; + + case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register; + case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register; + case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register; + case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register; + case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register; + case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register; + case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register; + case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register; + + // Post-index of lane loads + case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register; + case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register; + case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register; + case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register; + + case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register; + case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register; + case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register; + case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register; + + case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register; + case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register; + case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register; + case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register; + + // Post-index of lane stores + case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register; + case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register; + case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register; + case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register; + + case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register; + case 
AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register; + case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register; + case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register; + + case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register; + case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register; + case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register; + case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register; } return Opc; // If not one we handle, return it unchanged. } -SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, - bool isUpdating, +SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, + unsigned NumVecs, const uint16_t *Opcodes) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); EVT VT = N->getValueType(0); unsigned OpcodeIndex; - switch (VT.getSimpleVT().SimpleTy) { + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; default: llvm_unreachable("unhandled vector load type"); - case MVT::v8i8: OpcodeIndex = 0; break; - case MVT::v4i16: OpcodeIndex = 1; break; - case MVT::v2f32: - case MVT::v2i32: OpcodeIndex = 2; break; - case MVT::v1f64: - case MVT::v1i64: OpcodeIndex = 3; break; - case MVT::v16i8: OpcodeIndex = 4; break; - case MVT::v8f16: - case MVT::v8i16: OpcodeIndex = 5; break; - case MVT::v4f32: - case MVT::v4i32: OpcodeIndex = 6; break; - case MVT::v2f64: - case MVT::v2i64: OpcodeIndex = 7; break; } unsigned Opc = Opcodes[OpcodeIndex]; @@ -632,9 +697,8 @@ SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, Ops.push_back(N->getOperand(0)); // Push back the Chain - std::vector ResTys; - bool is64BitVector = VT.is64BitVector(); - + SmallVector ResTys; + // Push back the type of return super register if (NumVecs == 1) ResTys.push_back(VT); else if (NumVecs == 3) @@ -675,8 +739,8 @@ SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, return NULL; } -SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, - bool isUpdating, +SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, + unsigned NumVecs, const uint16_t *Opcodes) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); SDLoc dl(N); @@ -685,28 +749,20 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, MemOp[0] = cast(N)->getMemOperand(); unsigned AddrOpIdx = isUpdating ? 1 : 2; - unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + unsigned Vec0Idx = 3; EVT VT = N->getOperand(Vec0Idx).getValueType(); unsigned OpcodeIndex; - switch (VT.getSimpleVT().SimpleTy) { + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 
3 : 7; break; default: llvm_unreachable("unhandled vector store type"); - case MVT::v8i8: OpcodeIndex = 0; break; - case MVT::v4i16: OpcodeIndex = 1; break; - case MVT::v2f32: - case MVT::v2i32: OpcodeIndex = 2; break; - case MVT::v1f64: - case MVT::v1i64: OpcodeIndex = 3; break; - case MVT::v16i8: OpcodeIndex = 4; break; - case MVT::v8f16: - case MVT::v8i16: OpcodeIndex = 5; break; - case MVT::v4f32: - case MVT::v4i32: OpcodeIndex = 6; break; - case MVT::v2f64: - case MVT::v2i64: OpcodeIndex = 7; break; } unsigned Opc = Opcodes[OpcodeIndex]; - std::vector ResTys; + SmallVector ResTys; if (isUpdating) ResTys.push_back(MVT::i64); ResTys.push_back(MVT::Other); // Type for the Chain @@ -720,7 +776,6 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Opc = getVLDSTRegisterUpdateOpcode(Opc); Ops.push_back(Inc); } - bool is64BitVector = VT.is64BitVector(); SmallVector Regs(N->op_begin() + Vec0Idx, N->op_begin() + Vec0Idx + NumVecs); @@ -737,6 +792,172 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, return VSt; } +SDValue +AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand) { + SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL, + VT, VTD, MVT::Other, + CurDAG->getTargetConstant(0, MVT::i64), + Operand, + CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32)); + return SDValue(Reg, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range"); + SDLoc dl(N); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector duplicate lane load type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SDValue SuperReg; + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(2); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + Ops.push_back(N->getOperand(0)); // Push back the Chain + + SmallVector ResTys; + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of the Chain + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLdDup)->setMemRefs(MemOp, MemOp + 1); + + SuperReg = SDValue(VLdDup, 0); + unsigned Sub0 = is64BitVector ? 
AArch64::dsub_0 : AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update uses of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); + return NULL; +} + +// We only have 128-bit vector type of load/store lane instructions. +// If it is 64-bit vector, we also select it to the 128-bit instructions. +// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and +// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output. +SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + SDLoc dl(N); + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + EVT VT64; // 64-bit Vector Type + + if (is64BitVector) { + VT64 = VT; + VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + } + + unsigned OpcodeIndex; + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = 0; break; + case 16: OpcodeIndex = 1; break; + case 32: OpcodeIndex = 2; break; + case 64: OpcodeIndex = 3; break; + default: llvm_unreachable("unhandled vector lane load/store type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector ResTys; + if (IsLoad) { + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of Chain + SmallVector Ops; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + SmallVector Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + if (is64BitVector) + for (unsigned i = 0; i < Regs.size(); i++) + Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]); + SDValue SuperReg = createQTuple(Regs); + + Ops.push_back(SuperReg); // Source Reg + SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32); + Ops.push_back(LaneValue); + Ops.push_back(Chain); // Push back the Chain + + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLdLn)->setMemRefs(MemOp, MemOp + 1); + if (!IsLoad) + return VLdLn; + + // Extract the subregisters. 
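+  // For a 64-bit vector the extraction below is done in two steps: first the
+  // 128-bit qsub_N register is peeled out of the tuple, then its low half is
+  // taken via sub_64.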
+ SuperReg = SDValue(VLdLn, 0); + unsigned Sub0 = AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg); + if (is64BitVector) { + SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0); + } + ReplaceUses(SDValue(N, Vec), SUB0); + } + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return NULL; +} + unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec) { assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range"); @@ -955,7 +1176,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed, AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed }; - return SelectVLD(Node, 1, true, Opcodes); + return SelectVLD(Node, true, 1, Opcodes); } case AArch64ISD::NEON_LD2_UPD: { static const uint16_t Opcodes[] = { @@ -964,7 +1185,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed, AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed }; - return SelectVLD(Node, 2, true, Opcodes); + return SelectVLD(Node, true, 2, Opcodes); } case AArch64ISD::NEON_LD3_UPD: { static const uint16_t Opcodes[] = { @@ -973,7 +1194,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed, AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed }; - return SelectVLD(Node, 3, true, Opcodes); + return SelectVLD(Node, true, 3, Opcodes); } case AArch64ISD::NEON_LD4_UPD: { static const uint16_t Opcodes[] = { @@ -982,7 +1203,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed, AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed }; - return SelectVLD(Node, 4, true, Opcodes); + return SelectVLD(Node, true, 4, Opcodes); } case AArch64ISD::NEON_LD1x2_UPD: { static const uint16_t Opcodes[] = { @@ -991,7 +1212,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed, AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed }; - return SelectVLD(Node, 2, true, Opcodes); + return SelectVLD(Node, true, 2, Opcodes); } case AArch64ISD::NEON_LD1x3_UPD: { static const uint16_t Opcodes[] = { @@ -1000,7 +1221,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed, AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed }; - return SelectVLD(Node, 3, true, Opcodes); + return SelectVLD(Node, true, 3, Opcodes); } case AArch64ISD::NEON_LD1x4_UPD: { static const uint16_t Opcodes[] = { @@ -1009,7 +1230,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed, AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed }; - return SelectVLD(Node, 4, true, Opcodes); + return SelectVLD(Node, true, 4, Opcodes); } case AArch64ISD::NEON_ST1_UPD: { static const uint16_t Opcodes[] = { @@ -1018,7 +1239,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed, AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed }; - return SelectVST(Node, 1, true, Opcodes); + return SelectVST(Node, true, 1, Opcodes); } case AArch64ISD::NEON_ST2_UPD: { static const uint16_t Opcodes[] = { @@ -1027,7 +1248,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed, 
AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed }; - return SelectVST(Node, 2, true, Opcodes); + return SelectVST(Node, true, 2, Opcodes); } case AArch64ISD::NEON_ST3_UPD: { static const uint16_t Opcodes[] = { @@ -1036,7 +1257,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed, AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed }; - return SelectVST(Node, 3, true, Opcodes); + return SelectVST(Node, true, 3, Opcodes); } case AArch64ISD::NEON_ST4_UPD: { static const uint16_t Opcodes[] = { @@ -1045,7 +1266,100 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed, AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed }; - return SelectVST(Node, 4, true, Opcodes); + return SelectVST(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S, + AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H, + AArch64::LD2R_4S, AArch64::LD2R_2D + }; + return SelectVLDDup(Node, false, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S, + AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H, + AArch64::LD3R_4S, AArch64::LD3R_2D + }; + return SelectVLDDup(Node, false, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S, + AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H, + AArch64::LD4R_4S, AArch64::LD4R_2D + }; + return SelectVLDDup(Node, false, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed, + AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed, + AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed, + AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed, + AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed, + AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed, + AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed, + AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed, + AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed, + AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed, + AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed, + AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed, + AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST2LN_UPD: { + static const uint16_t Opcodes[] = { + 
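// One opcode per element size: B, H, S, D. +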
AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed, + AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed, + AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed, + AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 4, Opcodes); } case AArch64ISD::NEON_ST1x2_UPD: { static const uint16_t Opcodes[] = { @@ -1054,7 +1368,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed, AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed }; - return SelectVST(Node, 2, true, Opcodes); + return SelectVST(Node, true, 2, Opcodes); } case AArch64ISD::NEON_ST1x3_UPD: { static const uint16_t Opcodes[] = { @@ -1063,7 +1377,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed, AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed }; - return SelectVST(Node, 3, true, Opcodes); + return SelectVST(Node, true, 3, Opcodes); } case AArch64ISD::NEON_ST1x4_UPD: { static const uint16_t Opcodes[] = { @@ -1072,7 +1386,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed, AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed }; - return SelectVST(Node, 4, true, Opcodes); + return SelectVST(Node, true, 4, Opcodes); } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); @@ -1105,114 +1419,149 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: break; - case Intrinsic::arm_neon_vld1: { - static const uint16_t Opcodes[] = { AArch64::LD1_8B, AArch64::LD1_4H, - AArch64::LD1_2S, AArch64::LD1_1D, - AArch64::LD1_16B, AArch64::LD1_8H, - AArch64::LD1_4S, AArch64::LD1_2D }; - return SelectVLD(Node, 1, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D, + AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D + }; + return SelectVLD(Node, false, 1, Opcodes); } case Intrinsic::arm_neon_vld2: { - static const uint16_t Opcodes[] = { AArch64::LD2_8B, AArch64::LD2_4H, - AArch64::LD2_2S, AArch64::LD1x2_1D, - AArch64::LD2_16B, AArch64::LD2_8H, - AArch64::LD2_4S, AArch64::LD2_2D }; - return SelectVLD(Node, 2, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D, + AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D + }; + return SelectVLD(Node, false, 2, Opcodes); } case Intrinsic::arm_neon_vld3: { - static const uint16_t Opcodes[] = { AArch64::LD3_8B, AArch64::LD3_4H, - AArch64::LD3_2S, AArch64::LD1x3_1D, - AArch64::LD3_16B, AArch64::LD3_8H, - AArch64::LD3_4S, AArch64::LD3_2D }; - return SelectVLD(Node, 3, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D, + AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D + }; + return SelectVLD(Node, false, 3, Opcodes); } case Intrinsic::arm_neon_vld4: { - static const uint16_t Opcodes[] = { AArch64::LD4_8B, AArch64::LD4_4H, - AArch64::LD4_2S, AArch64::LD1x4_1D, - 
AArch64::LD4_16B, AArch64::LD4_8H, - AArch64::LD4_4S, AArch64::LD4_2D }; - return SelectVLD(Node, 4, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D, + AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D + }; + return SelectVLD(Node, false, 4, Opcodes); } case Intrinsic::aarch64_neon_vld1x2: { static const uint16_t Opcodes[] = { - AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, - AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, - AArch64::LD1x2_4S, AArch64::LD1x2_2D + AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, + AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, + AArch64::LD1x2_4S, AArch64::LD1x2_2D }; - return SelectVLD(Node, 2, false, Opcodes); + return SelectVLD(Node, false, 2, Opcodes); } case Intrinsic::aarch64_neon_vld1x3: { static const uint16_t Opcodes[] = { - AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, - AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, - AArch64::LD1x3_4S, AArch64::LD1x3_2D + AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, + AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, + AArch64::LD1x3_4S, AArch64::LD1x3_2D }; - return SelectVLD(Node, 3, false, Opcodes); + return SelectVLD(Node, false, 3, Opcodes); } case Intrinsic::aarch64_neon_vld1x4: { static const uint16_t Opcodes[] = { - AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, - AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, - AArch64::LD1x4_4S, AArch64::LD1x4_2D + AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, + AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, + AArch64::LD1x4_4S, AArch64::LD1x4_2D }; - return SelectVLD(Node, 4, false, Opcodes); + return SelectVLD(Node, false, 4, Opcodes); } case Intrinsic::arm_neon_vst1: { - static const uint16_t Opcodes[] = { AArch64::ST1_8B, AArch64::ST1_4H, - AArch64::ST1_2S, AArch64::ST1_1D, - AArch64::ST1_16B, AArch64::ST1_8H, - AArch64::ST1_4S, AArch64::ST1_2D }; - return SelectVST(Node, 1, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D, + AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D + }; + return SelectVST(Node, false, 1, Opcodes); } case Intrinsic::arm_neon_vst2: { - static const uint16_t Opcodes[] = { AArch64::ST2_8B, AArch64::ST2_4H, - AArch64::ST2_2S, AArch64::ST1x2_1D, - AArch64::ST2_16B, AArch64::ST2_8H, - AArch64::ST2_4S, AArch64::ST2_2D }; - return SelectVST(Node, 2, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D, + AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D + }; + return SelectVST(Node, false, 2, Opcodes); } case Intrinsic::arm_neon_vst3: { - static const uint16_t Opcodes[] = { AArch64::ST3_8B, AArch64::ST3_4H, - AArch64::ST3_2S, AArch64::ST1x3_1D, - AArch64::ST3_16B, AArch64::ST3_8H, - AArch64::ST3_4S, AArch64::ST3_2D }; - return SelectVST(Node, 3, false, Opcodes); + static const uint16_t Opcodes[] = { + AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D, + AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D + }; + return SelectVST(Node, false, 3, Opcodes); } case Intrinsic::arm_neon_vst4: { - static const uint16_t Opcodes[] = { AArch64::ST4_8B, AArch64::ST4_4H, - AArch64::ST4_2S, AArch64::ST1x4_1D, - AArch64::ST4_16B, AArch64::ST4_8H, - AArch64::ST4_4S, AArch64::ST4_2D }; - return SelectVST(Node, 4, false, Opcodes); + 
static const uint16_t Opcodes[] = { + AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D, + AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D + }; + return SelectVST(Node, false, 4, Opcodes); } case Intrinsic::aarch64_neon_vst1x2: { static const uint16_t Opcodes[] = { - AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S, - AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H, - AArch64::ST1x2_4S, AArch64::ST1x2_2D + AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S, + AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H, + AArch64::ST1x2_4S, AArch64::ST1x2_2D }; - return SelectVST(Node, 2, false, Opcodes); + return SelectVST(Node, false, 2, Opcodes); } case Intrinsic::aarch64_neon_vst1x3: { static const uint16_t Opcodes[] = { - AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S, - AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H, - AArch64::ST1x3_4S, AArch64::ST1x3_2D + AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S, + AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H, + AArch64::ST1x3_4S, AArch64::ST1x3_2D }; - return SelectVST(Node, 3, false, Opcodes); + return SelectVST(Node, false, 3, Opcodes); } case Intrinsic::aarch64_neon_vst1x4: { static const uint16_t Opcodes[] = { - AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S, - AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H, - AArch64::ST1x4_4S, AArch64::ST1x4_2D + AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S, + AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H, + AArch64::ST1x4_4S, AArch64::ST1x4_2D }; - return SelectVST(Node, 4, false, Opcodes); + return SelectVST(Node, false, 4, Opcodes); } + case Intrinsic::arm_neon_vld2lane: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D + }; + return SelectVLDSTLane(Node, true, false, 2, Opcodes); } + case Intrinsic::arm_neon_vld3lane: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D + }; + return SelectVLDSTLane(Node, true, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4lane: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D + }; + return SelectVLDSTLane(Node, true, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vst2lane: { + static const uint16_t Opcodes[] = { + AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D + }; + return SelectVLDSTLane(Node, false, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3lane: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D + }; + return SelectVLDSTLane(Node, false, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4lane: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D + }; + return SelectVLDSTLane(Node, false, false, 4, Opcodes); + } + } // End of switch IntNo break; - } + } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN default: break; // Let generic code handle it } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index bf04bf3747d..003359d1b57 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -949,6 +949,30 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { return "AArch64ISD::NEON_ST1x3_UPD"; case AArch64ISD::NEON_ST1x4_UPD: return 
"AArch64ISD::NEON_ST1x4_UPD"; + case AArch64ISD::NEON_LD2DUP: + return "AArch64ISD::NEON_LD2DUP"; + case AArch64ISD::NEON_LD3DUP: + return "AArch64ISD::NEON_LD3DUP"; + case AArch64ISD::NEON_LD4DUP: + return "AArch64ISD::NEON_LD4DUP"; + case AArch64ISD::NEON_LD2DUP_UPD: + return "AArch64ISD::NEON_LD2DUP_UPD"; + case AArch64ISD::NEON_LD3DUP_UPD: + return "AArch64ISD::NEON_LD3DUP_UPD"; + case AArch64ISD::NEON_LD4DUP_UPD: + return "AArch64ISD::NEON_LD4DUP_UPD"; + case AArch64ISD::NEON_LD2LN_UPD: + return "AArch64ISD::NEON_LD2LN_UPD"; + case AArch64ISD::NEON_LD3LN_UPD: + return "AArch64ISD::NEON_LD3LN_UPD"; + case AArch64ISD::NEON_LD4LN_UPD: + return "AArch64ISD::NEON_LD4LN_UPD"; + case AArch64ISD::NEON_ST2LN_UPD: + return "AArch64ISD::NEON_ST2LN_UPD"; + case AArch64ISD::NEON_ST3LN_UPD: + return "AArch64ISD::NEON_ST3LN_UPD"; + case AArch64ISD::NEON_ST4LN_UPD: + return "AArch64ISD::NEON_ST4LN_UPD"; case AArch64ISD::NEON_VEXTRACT: return "AArch64ISD::NEON_VEXTRACT"; default: @@ -3518,7 +3542,9 @@ static SDValue CombineBaseUpdate(SDNode *N, return SDValue(); SelectionDAG &DAG = DCI.DAG; - unsigned AddrOpIdx = 2; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. @@ -3536,39 +3562,65 @@ static SDValue CombineBaseUpdate(SDNode *N, // Find the new opcode for the updating load/store. bool isLoad = true; + bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; - NumVecs = 1; break; - case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; - NumVecs = 2; break; - case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; - NumVecs = 3; break; - case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; - NumVecs = 4; break; - case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; - NumVecs = 1; isLoad = false; break; - case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; - NumVecs = 4; isLoad = false; break; - case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; - NumVecs = 2; break; - case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; - NumVecs = 3; break; - case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; - NumVecs = 4; break; - case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; - NumVecs = 4; isLoad = false; break; + if (isIntrinsic) { + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc 
= AArch64ISD::NEON_LD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; + NumVecs = 2; break; + case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; + NumVecs = 3; break; + case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; + NumVecs = 4; break; + } } // Find the size of memory referenced by the load/store. @@ -3578,6 +3630,8 @@ static SDValue CombineBaseUpdate(SDNode *N, else VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); @@ -3624,6 +3678,83 @@ static SDValue CombineBaseUpdate(SDNode *N, return SDValue(); } +/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) +/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. +/// If so, combine them to a vldN-dup operation and return true. +static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. 
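+  // That is, a sequence such as
+  //   %v = vld2lane(..., lane n); vduplane(%v.0, n); vduplane(%v.1, n)
+  // can load the element once and replicate it with ld2r instead.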
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = AArch64ISD::NEON_LD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = AArch64ISD::NEON_LD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = AArch64ISD::NEON_LD4DUP;
+  } else {
+    return SDValue();
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return SDValue();
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
+                                           VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
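+  // The dup node exposes its chain in the same result slot (NumVecs), so
+  // handing every result, chain included, to CombineTo below retires the
+  // old intrinsic completely.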
+ std::vector VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return SDValue(N, 0); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3637,6 +3768,12 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N, return PerformShiftCombine(N, DCI, getSubtarget()); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case AArch64ISD::NEON_VDUPLANE: + return CombineVLDDUP(N, DCI); + case AArch64ISD::NEON_LD2DUP: + case AArch64ISD::NEON_LD3DUP: + case AArch64ISD::NEON_LD4DUP: + return CombineBaseUpdate(N, DCI); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -3648,12 +3785,18 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: case Intrinsic::aarch64_neon_vld1x2: case Intrinsic::aarch64_neon_vld1x3: case Intrinsic::aarch64_neon_vld1x4: case Intrinsic::aarch64_neon_vst1x2: case Intrinsic::aarch64_neon_vst1x3: case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: return CombineBaseUpdate(N, DCI); default: break; @@ -4203,7 +4346,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vld4: case Intrinsic::aarch64_neon_vld1x2: case Intrinsic::aarch64_neon_vld1x3: - case Intrinsic::aarch64_neon_vld1x4: { + case Intrinsic::aarch64_neon_vld1x4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; @@ -4223,7 +4369,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vst4: case Intrinsic::aarch64_neon_vst1x2: case Intrinsic::aarch64_neon_vst1x3: - case Intrinsic::aarch64_neon_vst1x4: { + case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. 
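+    // For the lane variants this over-estimates the bytes actually touched;
+    // a too-large memVT is safe, it only makes alias analysis conservative.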
unsigned NumElts = 0; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 0f30a7a9d2e..a51d10f01cf 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -152,8 +152,13 @@ namespace AArch64ISD { // Vector extract NEON_VEXTRACT, + // NEON duplicate lane loads + NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + NEON_LD3DUP, + NEON_LD4DUP, + // NEON loads with post-increment base updates: - NEON_LD1_UPD = ISD::FIRST_TARGET_MEMORY_OPCODE, + NEON_LD1_UPD, NEON_LD2_UPD, NEON_LD3_UPD, NEON_LD4_UPD, @@ -168,7 +173,22 @@ namespace AArch64ISD { NEON_ST4_UPD, NEON_ST1x2_UPD, NEON_ST1x3_UPD, - NEON_ST1x4_UPD + NEON_ST1x4_UPD, + + // NEON duplicate lane loads with post-increment base updates: + NEON_LD2DUP_UPD, + NEON_LD3DUP_UPD, + NEON_LD4DUP_UPD, + + // NEON lane loads with post-increment base updates: + NEON_LD2LN_UPD, + NEON_LD3LN_UPD, + NEON_LD4LN_UPD, + + // NEON lane store with post-increment base updates: + NEON_ST2LN_UPD, + NEON_ST3LN_UPD, + NEON_ST4LN_UPD }; } diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 2a0cca86183..34f917caabe 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -1297,6 +1297,85 @@ class NeonI_LdStMult_Post opcode, bits<2> size, // Inherit Rt in 4-0 } +// Format AdvSIMD vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup opcode, bits<2> size, dag outs, + dag ins, string asmstr, list patterns, + InstrItinClass itin> + : A64InstRtn +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011010; + let Inst{22} = 0b1; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load/store Single N-element structure to/from one lane +class NeonI_LdStOne_Lane op2_1, bit op0, dag outs, + dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtn +{ + bits<4> lane; + let Inst{31} = 0b0; + let Inst{29-23} = 0b0011010; + let Inst{22} = l; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup_Post opcode, bits<2> size, dag outs, + dag ins, string asmstr, list patterns, + InstrItinClass itin> + : A64InstRtnm +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011011; + let Inst{22} = 0b1; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load/store Single N-element structure +// to/from one lane +class NeonI_LdStOne_Lane_Post op2_1, bit op0, dag outs, + dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtnm +{ + bits<4> lane; + let Inst{31} = 0b0; + let Inst{29-23} = 0b0011011; + let Inst{22} = l; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + // Format AdvSIMD 3 scalar registers with different type class NeonI_Scalar3Diff size, bits<4> opcode, diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index b6fa6fa8939..bcd59bd2e72 100644 --- 
a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -3456,6 +3456,51 @@ def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; // The followings are post-index vector load/store multiple N-element // structure(class SIMD lselem-post) +def exact1_asmoperand : AsmOperandClass { + let Name = "Exact1"; + let PredicateMethod = "isExactImm<1>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact1 : Operand, ImmLeaf { + let ParserMatchClass = exact1_asmoperand; +} + +def exact2_asmoperand : AsmOperandClass { + let Name = "Exact2"; + let PredicateMethod = "isExactImm<2>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact2 : Operand, ImmLeaf { + let ParserMatchClass = exact2_asmoperand; +} + +def exact3_asmoperand : AsmOperandClass { + let Name = "Exact3"; + let PredicateMethod = "isExactImm<3>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact3 : Operand, ImmLeaf { + let ParserMatchClass = exact3_asmoperand; +} + +def exact4_asmoperand : AsmOperandClass { + let Name = "Exact4"; + let PredicateMethod = "isExactImm<4>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact4 : Operand, ImmLeaf { + let ParserMatchClass = exact4_asmoperand; +} + +def exact6_asmoperand : AsmOperandClass { + let Name = "Exact6"; + let PredicateMethod = "isExactImm<6>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact6 : Operand, ImmLeaf { + let ParserMatchClass = exact6_asmoperand; +} + def exact8_asmoperand : AsmOperandClass { let Name = "Exact8"; let PredicateMethod = "isExactImm<8>"; @@ -3465,6 +3510,15 @@ def uimm_exact8 : Operand, ImmLeaf { let ParserMatchClass = exact8_asmoperand; } +def exact12_asmoperand : AsmOperandClass { + let Name = "Exact12"; + let PredicateMethod = "isExactImm<12>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact12 : Operand, ImmLeaf { + let ParserMatchClass = exact12_asmoperand; +} + def exact16_asmoperand : AsmOperandClass { let Name = "Exact16"; let PredicateMethod = "isExactImm<16>"; @@ -3678,6 +3732,574 @@ defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, // End of post-index vector load/store multiple N-element structure // (class SIMD lselem-post) +// The followings are vector load/store single N-element structure +// (class SIMD lsone). 
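+// The neon_uimmN_bare operands below carry the lane index; they print as a
+// bare immediate (no '#' prefix), matching the v0.b[3] lane syntax.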
+def neon_uimm0_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm1_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm1_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm2_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm2_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm3_bare : Operand, + ImmLeaf { + let ParserMatchClass = uimm3_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm4_bare : Operand, + ImmLeaf { + let ParserMatchClass = uimm4_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +class NeonI_LDN_Dup opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdOne_Dup { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDN_Dup_BHSD opcode, string List, string asmop> { + def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), asmop>; + + def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Load single 1-element structure to all lanes of 1 register +defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; + +// Load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; +defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; +defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; + + +class LD1R_pattern + : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), + (VTy (INST GPR64xsp:$Rn))>; + +// Match all LD1R instructions +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + + +multiclass VectorList_Bare_BHSD { + defm B : VectorList_operands; + defm H : VectorList_operands; + defm S : VectorList_operands; + defm D : VectorList_operands; +} + +// Special vector list operand of 128-bit vectors with bare layout. +// i.e. 
only show ".b", ".h", ".s", ".d" +defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>; +defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>; +defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>; +defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>; + +class NeonI_LDN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<1, r, op2_1, op0, + (outs VList:$Rt), + (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; + let Constraints = "$src = $Rt"; +} + +multiclass LDN_Lane_BHSD { + def _B : NeonI_LDN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_LDN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_LDN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_LDN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Load single 1-element structure to one lane of 1 register. +defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; + +// Load single N-element structure to one lane of N consecutive registers +// (N = 2,3,4) +defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; +defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; +defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; + +multiclass LD1LN_patterns { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; + + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; +} + +// Match all LD1LN instructions +defm : LD1LN_patterns; + +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<0, r, op2_1, op0, + (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayStore = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; +} + +multiclass STN_Lane_BHSD { + def _B : NeonI_STN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_STN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_STN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_STN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop>{ + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Store single 1-element structure from one lane of 1 register. 
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; + +// Store single N-element structure from one lane of N consecutive registers +// (N = 2,3,4) +defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; +defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; +defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; + +multiclass ST1LN_patterns { + def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), + ImmOp:$lane)>; + + def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; +} + +// Match all ST1LN instructions +defm : ST1LN_patterns; + +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +// End of vector load/store single N-element structure (class SIMD lsone). + + +// The following are post-index load/store single N-element instructions +// (class SIMD lsone-post) + +multiclass NeonI_LDN_WB_Dup opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + def _fixed : NeonI_LdOne_Dup_Post { + let Rm = 0b11111; + } + + def _register : NeonI_LdOne_Dup_Post; + } +} + +multiclass LDWB_Dup_BHSD opcode, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, + !cast(List # "8B_operand"), + uimm_b, asmop>; + + defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), + uimm_h, asmop>; + + defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), + uimm_s, asmop>; + + defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), + uimm_d, asmop>; + + defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), + uimm_b, asmop>; + + defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), + uimm_h, asmop>; + + defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), + uimm_s, asmop>; + + defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), + uimm_d, asmop>; +} + +// Post-index load single 1-element structure to all lanes of 1 register +defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, + Constraints = "$Rn = $wb, $Rt = $src", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class LDN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + class LDN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, + Operand 
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+    Constraints = "$Rn = $wb, $Rt = $src",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                      Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                 VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
+                                [],
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                       Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
+                                 VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
+                                [],
+                                NoItinerary>;
+}
+
+multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Post-index load single 1-element structure to one lane of 1 register.
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to one lane of N consecutive
+// registers (N = 2,3,4)
+defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayStore = 1, neverHasSideEffects = 1,
+    hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                      Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                 VList:$Rt, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
+                                [],
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                       Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
+                                 ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
+                                [],
+                                NoItinerary>;
+}
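+// The store variants mirror the loads but have no tied $src list: only the
+// updated base register is a result. An illustrative pair of encodings:
+//   st3 {v0.s, v1.s, v2.s}[1], [x0], #12   // fixed: 3 vectors x 4 bytes
+//   st3 {v0.s, v1.s, v2.s}[1], [x0], x3    // register offset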
+multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : STN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : STN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Post-index store single 1-element structure from one lane of 1 register.
+defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+// End of post-index load/store single N-element instructions
+// (class SIMD lsone-post)

 // Neon Scalar instructions implementation
 // Scalar Three Same
@@ -4737,36 +5359,6 @@ defm : Neon_ScalarPair_SD_size_patterns;
-def neon_uimm0_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm == 0;}]> {
-  let ParserMatchClass = neon_uimm0_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm1_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 2;}]> {
-  let ParserMatchClass = neon_uimm1_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm2_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 4;}]> {
-  let ParserMatchClass = neon_uimm2_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm3_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 8;}]> {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm4_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 16;}]> {
-  let ParserMatchClass = uimm4_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
 // Scalar by element Arithmetic
@@ -5316,6 +5908,8 @@ def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
 
+// Scalar Three Same
+
 def neon_uimm3 : Operand<i64>, ImmLeaf<i64, [{return Imm < 8;}]> {
   let ParserMatchClass = uimm3_asmoperand;
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c351dbeb973..1e0033c164e 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1985,6 +1985,7 @@ bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
 // Now there are two kinds of vector list when number of vector > 1:
 //   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
 //   (2) {Vn.layout - Vm.layout}
+// If the layout is like .b/.h/.s/.d, also parse the lane.
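+// For example, "{v2.h, v3.h}[5]" is a QPair with a bare .h layout, so
+// NumLanes is 8 and the lane index 5 is parsed and checked against that
+// bound (a sketch of the behaviour implemented below).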
 AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
     SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (Parser.getTok().isNot(AsmToken::LCurly)) {
@@ -2065,7 +2066,7 @@ AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
   A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
   if (Count > 1) { // If count > 1, create vector list using super register.
-    bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
+    bool IsVec64 = (Layout < A64Layout::_16B);
     static unsigned SupRegIDs[3][2] = {
       { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
       { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
@@ -2080,7 +2081,22 @@ AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
   Operands.push_back(
       AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
 
-  return MatchOperand_Success;
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    uint32_t NumLanes = 0;
+    switch (Layout) {
+    case A64Layout::_B: NumLanes = 16; break;
+    case A64Layout::_H: NumLanes = 8; break;
+    case A64Layout::_S: NumLanes = 4; break;
+    case A64Layout::_D: NumLanes = 2; break;
+    default:
+      SMLoc Loc = getLexer().getLoc();
+      Error(Loc, "expected comma before next operand");
+      return MatchOperand_ParseFail;
+    }
+    return ParseNEONLane(Operands, NumLanes);
+  } else {
+    return MatchOperand_Success;
+  }
 }
 
 // FIXME: We would really like to be able to tablegen'erate this.
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index c4f30628c31..f003d8c04b2 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -234,6 +234,10 @@ static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
                                                uint64_t Address,
                                                const void *Decoder);
 
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
 static bool Check(DecodeStatus &Out, DecodeStatus In);
 
 #include "AArch64GenDisassemblerTables.inc"
@@ -414,7 +418,7 @@ static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
                                                   unsigned RegNo,
                                                   uint64_t Address,
                                                   const void *Decoder) {
-  if (RegNo >= 30)
+  if (RegNo > 30)
     return MCDisassembler::Fail;
 
   uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
@@ -1102,3 +1106,426 @@ static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
 
   return MCDisassembler::Success;
 }
+
+// Decode post-index vector load/store lane instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal to the length of the changed bytes,
+// otherwise Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  bool Is64bitVec = false;
+  bool IsLoadDup = false;
+  bool IsLoad = false;
+  unsigned TransferBytes = 0; // The total number of bytes transferred.
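+  // TransferBytes is NumVecs * (element size in bytes); e.g. for
+  // "ld3r {v0.4h, v1.4h, v2.4h}, [x0], #6" it is 3 x 2 = 6, which is also
+  // the fixed-form post-index immediate.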
+ unsigned NumVecs = 0; + unsigned Opc = Inst.getOpcode(); + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: { + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: + TransferBytes = 8; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: { + switch (Opc) { + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: + TransferBytes = 8; break; + } + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: { + switch (Opc) { + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: + TransferBytes = 16; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 2; + break; + } + + case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: { + switch (Opc) { + case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: + TransferBytes = 16; break; + } + IsLoadDup = true; + NumVecs = 2; + break; + } + + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: + case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: { + switch (Opc) { + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + TransferBytes = 3; break; + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + TransferBytes = 
6; break;
+    case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
+      TransferBytes = 24; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+  case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+  case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+  case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+  case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+  case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+  case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
+      TransferBytes = 32; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+  case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+  case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+  case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
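+  // Lane (non-dup) loads and stores follow the same scheme: TransferBytes is
+  // NumVecs * element size, e.g. "ld2 {v0.b, v1.b}[3], [x0], #2" transfers
+  // 2 bytes.
+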
+  case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+  case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+  case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+  case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    IsLoad = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+  case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+  case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+  case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    IsLoad = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+  case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+  case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+  case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoad = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+  case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+  case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+  case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoad = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+  case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+  case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+  case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+  case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+  case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+  case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    NumVecs = 2;
+    break;
+  }
+  case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+  case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+  case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+  case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+  case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+  case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+  case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    NumVecs = 4;
+    break;
+  }
+
+  default:
+    return MCDisassembler::Fail;
+  } // End of switch (Opc)
+
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+
+  // Decode post-index of load duplicate lane
+  if (IsLoadDup) {
+    switch (NumVecs) {
+    case 1:
+      Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+
+    // Decode the write back register, which is equal to Rn. It is decoded
+    // twice: once for the $wb result and once for the $Rn base operand.
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+    if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes
+      Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+    else // Decode Rm
+      DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+    return MCDisassembler::Success;
+  }
+
+  // Decode post-index of load/store lane
+  // Loads have a vector list as output.
+  if (IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+  }
+
+  // Decode the write back register, which is equal to Rn (once for the $wb
+  // result, once for the $Rn base operand).
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes
+    Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+  else // Decode Rm
+    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
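+  // The list decoded below is an input operand: for the loads it is the
+  // register list tied to the $Rt result ($Rt = $src in the instruction
+  // definitions above), for the stores it is the $Rt source list itself.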
+  // Decode the source vector list.
+  switch (NumVecs) {
+  case 1:
+    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 2:
+    DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 3:
+    DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 4:
+    DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+  }
+
+  // Decode lane
+  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+  unsigned S = fieldFromInstruction(Insn, 10, 3);
+  unsigned lane = 0;
+  // Calculate the number of lanes from the number of vectors and the
+  // transferred bytes: NumLanes = 16 bytes / bytes of each lane.
+  unsigned NumLanes = 16 / (TransferBytes / NumVecs);
+  switch (NumLanes) {
+  case 16: // A vector has 16 lanes, the lane is encoded in Q:S.
+    lane = (Q << 3) | S;
+    break;
+  case 8: // The lane is encoded in Q:S[2:1].
+    lane = (Q << 2) | (S >> 1);
+    break;
+  case 4: // The lane is encoded in Q:S[2].
+    lane = (Q << 1) | (S >> 2);
+    break;
+  case 2: // The lane is encoded in Q.
+    lane = Q;
+    break;
+  }
+  Inst.addOperand(MCOperand::CreateImm(lane));
+
+  return MCDisassembler::Success;
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index c0816917562..24205b57b9e 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -521,7 +521,7 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
   std::string LayoutStr = A64VectorLayoutToString(Layout);
   O << "{";
   if (Count > 1) { // Print sub registers separately
-    bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
+    bool IsVec64 = (Layout < A64Layout::_16B);
     unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
     for (unsigned I = 0; I < Count; I++) {
       std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7db52381813..d6ae147182d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -317,7 +317,14 @@ namespace A64Layout {
     _16B,
     _8H,
     _4S,
-    _2D
+    _2D,
+
+    // Bare layout for the 128-bit vector
+    // (only show ".b", ".h", ".s", ".d" without vector number)
+    _B,
+    _H,
+    _S,
+    _D
   };
 }
 
@@ -332,6 +339,10 @@ A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
     case A64Layout::_8H: return ".8h";
     case A64Layout::_4S: return ".4s";
     case A64Layout::_2D: return ".2d";
+    case A64Layout::_B: return ".b";
+    case A64Layout::_H: return ".h";
+    case A64Layout::_S: return ".s";
+    case A64Layout::_D: return ".d";
     default: llvm_unreachable("Unknown Vector Layout");
   }
 }
@@ -347,6 +358,10 @@ A64StringToVectorLayout(StringRef LayoutStr) {
              .Case(".8h", A64Layout::_8H)
              .Case(".4s", A64Layout::_4S)
              .Case(".2d", A64Layout::_2D)
+             .Case(".b", A64Layout::_B)
+             .Case(".h", A64Layout::_H)
+             .Case(".s", A64Layout::_S)
+             .Case(".d", A64Layout::_D)
              .Default(A64Layout::Invalid);
 }
 
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 00000000000..3f28320f23d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x
double>] } +%struct.int8x16x3_t = type { [3 x <16 x i8>] } +%struct.int16x8x3_t = type { [3 x <8 x i16>] } +%struct.int32x4x3_t = type { [3 x <4 x i32>] } +%struct.int64x2x3_t = type { [3 x <2 x i64>] } +%struct.float32x4x3_t = type { [3 x <4 x float>] } +%struct.float64x2x3_t = type { [3 x <2 x double>] } +%struct.int8x8x3_t = type { [3 x <8 x i8>] } +%struct.int16x4x3_t = type { [3 x <4 x i16>] } +%struct.int32x2x3_t = type { [3 x <2 x i32>] } +%struct.int64x1x3_t = type { [3 x <1 x i64>] } +%struct.float32x2x3_t = type { [3 x <2 x float>] } +%struct.float64x1x3_t = type { [3 x <1 x double>] } +%struct.int8x16x4_t = type { [4 x <16 x i8>] } +%struct.int16x8x4_t = type { [4 x <8 x i16>] } +%struct.int32x4x4_t = type { [4 x <4 x i32>] } +%struct.int64x2x4_t = type { [4 x <2 x i64>] } +%struct.float32x4x4_t = type { [4 x <4 x float>] } +%struct.float64x2x4_t = type { [4 x <2 x double>] } +%struct.int8x8x4_t = type { [4 x <8 x i8>] } +%struct.int16x4x4_t = type { [4 x <4 x i16>] } +%struct.int32x2x4_t = type { [4 x <2 x i32>] } +%struct.int64x1x4_t = type { [4 x <1 x i64>] } +%struct.float32x2x4_t = type { [4 x <2 x float>] } +%struct.float64x1x4_t = type { [4 x <1 x double>] } + +define <16 x i8> @test_vld1q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld1q_dup_s8 +; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0] +entry: + %0 = load i8* %a, align 1 + %1 = insertelement <16 x i8> undef, i8 %0, i32 0 + %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %lane +} + +define <8 x i16> @test_vld1q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld1q_dup_s16 +; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0] +entry: + %0 = load i16* %a, align 2 + %1 = insertelement <8 x i16> undef, i16 %0, i32 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %lane +} + +define <4 x i32> @test_vld1q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld1q_dup_s32 +; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0] +entry: + %0 = load i32* %a, align 4 + %1 = insertelement <4 x i32> undef, i32 %0, i32 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %lane +} + +define <2 x i64> @test_vld1q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld1q_dup_s64 +; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0] +entry: + %0 = load i64* %a, align 8 + %1 = insertelement <2 x i64> undef, i64 %0, i32 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %lane +} + +define <4 x float> @test_vld1q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld1q_dup_f32 +; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0] +entry: + %0 = load float* %a, align 4 + %1 = insertelement <4 x float> undef, float %0, i32 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %lane +} + +define <2 x double> @test_vld1q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld1q_dup_f64 +; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0] +entry: + %0 = load double* %a, align 8 + %1 = insertelement <2 x double> undef, double %0, i32 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + ret <2 x double> %lane +} + +define <8 x i8> @test_vld1_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld1_dup_s8 +; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0] +entry: + %0 = load i8* %a, align 1 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %lane +} + +define <4 x i16> @test_vld1_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld1_dup_s16 +; CHECK: ld1r 
{{{v[0-9]+}}.4h}, [x0] +entry: + %0 = load i16* %a, align 2 + %1 = insertelement <4 x i16> undef, i16 %0, i32 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %lane +} + +define <2 x i32> @test_vld1_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld1_dup_s32 +; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0] +entry: + %0 = load i32* %a, align 4 + %1 = insertelement <2 x i32> undef, i32 %0, i32 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %lane +} + +define <1 x i64> @test_vld1_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld1_dup_s64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load i64* %a, align 8 + %1 = insertelement <1 x i64> undef, i64 %0, i32 0 + ret <1 x i64> %1 +} + +define <2 x float> @test_vld1_dup_f32(float* %a) { +; CHECK-LABEL: test_vld1_dup_f32 +; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0] +entry: + %0 = load float* %a, align 4 + %1 = insertelement <2 x float> undef, float %0, i32 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %lane +} + +define <1 x double> @test_vld1_dup_f64(double* %a) { +; CHECK-LABEL: test_vld1_dup_f64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load double* %a, align 8 + %1 = insertelement <1 x double> undef, double %0, i32 0 + ret <1 x double> %1 +} + +define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld2q_dup_s8 +; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1 + %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld2q_dup_s16 +; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1 + %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld2q_dup_s32 +; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1 + %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> 
zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld2q_dup_s64 +; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) + %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1 + %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 + ret %struct.int64x2x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld2q_dup_f32 +; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4) + %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1 + %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld2q_dup_f64 +; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1 + %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 + ret %struct.float64x2x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld2_dup_s8 +; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0] +entry: + %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0 + %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer + %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 + ret %struct.int8x8x2_t 
%.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld2_dup_s16 +; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1 + %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld2_dup_s32 +; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1 + %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld2_dup_s64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1 + %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 + ret %struct.int64x1x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) { +; CHECK-LABEL: test_vld2_dup_f32 +; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4) + %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1 + %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) { +; CHECK-LABEL: test_vld2_dup_f64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x 
double> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1 + %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 + ret %struct.float64x1x2_t %.fca.0.1.insert +} + +define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld3q_dup_s8 +; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1 + %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2 + %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2 + ret %struct.int8x16x3_t %.fca.0.2.insert +} + +define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld3q_dup_s16 +; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1 + %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2 + %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2 + ret %struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld3q_dup_s32 +; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1 + %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2 + %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue 
%struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld3q_dup_s64 +; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) + %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1 + %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2 + %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld3q_dup_f32 +; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4) + %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1 + %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2 + %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld3q_dup_f64 +; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1 + %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2 + %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x3_t 
%.fca.0.0.insert, <2 x double> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld3_dup_s8 +; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0] +entry: + %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0 + %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer + %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2 + %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld3_dup_s16 +; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1 + %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2 + %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld3_dup_s32 +; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1 + %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2 + %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t 
@test_vld3_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld3_dup_s64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) { +; CHECK-LABEL: test_vld3_dup_f32 +; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) + %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1 + %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2 + %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) { +; CHECK-LABEL: test_vld3_dup_f64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2 + ret %struct.float64x1x3_t %.fca.0.2.insert +} + +define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld4q_dup_s8 +; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = 
shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4q_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4q_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4q_dup_s64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4q_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+  %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
+  %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4q_dup_f64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+  %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
+  %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+  %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
+  %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
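+; The tests below cover LD1 to a single lane: one element is loaded from
+; [x0] and inserted into the selected lane of an existing vector, leaving
+; the other lanes unchanged. For the 1-element (1d) cases the insert into
+; lane 0 of undef is duplicate semantics, so LD1R is expected instead.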
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+  ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+  ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+  ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+  ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+  ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+  ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+  ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+  ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+  ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+  ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+  ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+  ret <1 x double> %vld1_lane
+}
+
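+; LD2 to one lane: two elements are loaded from consecutive addresses at
+; [x0] and written to the selected lane of each of the two input vectors.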
+define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
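+; LD3 to one lane: three elements are loaded from consecutive addresses at
+; [x0] and written to the selected lane of each of the three input vectors.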
+define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
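+; LD4 to one lane: four elements are loaded from consecutive addresses at
+; [x0] and written to the selected lane of each of the four input vectors.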
+define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+  %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+  %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+  %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+  %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+  %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+  %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+  %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+  %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+  %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+  %vld4_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+  %vld4_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+  %vld4_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
+  %vld4_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
+  %vld4_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
+  %vld4_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4_lane.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
test_vst1q_lane_f64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x double> %b, i32 1 + store double %0, double* %a, align 8 + ret void +} + +define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) { +; CHECK-LABEL: test_vst1_lane_s8 +; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <8 x i8> %b, i32 7 + store i8 %0, i8* %a, align 1 + ret void +} + +define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) { +; CHECK-LABEL: test_vst1_lane_s16 +; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <4 x i16> %b, i32 3 + store i16 %0, i16* %a, align 2 + ret void +} + +define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) { +; CHECK-LABEL: test_vst1_lane_s32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x i32> %b, i32 1 + store i32 %0, i32* %a, align 4 + ret void +} + +define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) { +; CHECK-LABEL: test_vst1_lane_s64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <1 x i64> %b, i32 0 + store i64 %0, i64* %a, align 8 + ret void +} + +define void @test_vst1_lane_f32(float* %a, <2 x float> %b) { +; CHECK-LABEL: test_vst1_lane_f32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x float> %b, i32 1 + store float %0, float* %a, align 4 + ret void +} + +define void @test_vst1_lane_f64(double* %a, <1 x double> %b) { +; CHECK-LABEL: test_vst1_lane_f64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <1 x double> %b, i32 0 + store double %0, double* %a, align 8 + ret void +} + +define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s8 +; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1) + ret void +} + +define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s16 +; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2) + ret void +} + +define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4) + ret void +} + +define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, 
<2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8) + ret void +} + +define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_f32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4) + ret void +} + +define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_f64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8) + ret void +} + +define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s8 +; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1) + ret void +} + +define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s16 +; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2) + ret void +} + +define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4) + ret void +} + +define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8) + ret void +} + +define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_f32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> 
%b.coerce.fca.1.extract, i32 1, i32 4) + ret void +} + +define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_f64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8) + ret void +} + +define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s8 +; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1) + ret void +} + +define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s16 +; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) + ret void +} + +define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) + ret void +} + +define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) + ret void +} + +define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_f32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> 
%b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) + ret void +} + +define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_f64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) + ret void +} + +define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s8 +; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) + ret void +} + +define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s16 +; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) + ret void +} + +define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) + ret void +} + +define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) + ret void +} + +define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_f32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] 
%b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) + ret void +} + +define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_f64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) + ret void +} + +define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s8 +; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1) + ret void +} + +define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s16 +; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2) + ret void +} + +define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4) + ret void +} + +define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 + 
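; Note: the trailing i32, i32 operands on these vldNlane/vstNlane calls are +; the lane index and the pointer alignment in bytes; in the call below, +; i32 1, i32 8 selects lane 1 (a .d lane) with 8-byte alignment. + 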
%b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8) + ret void +} + +define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_f32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4) + ret void +} + +define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_f64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8) + ret void +} + +define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s8 +; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1) + ret void +} + +define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s16 +; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2) + ret void +} + +define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, 
{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4) + ret void +} + +define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8) + ret void +} + +define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_f32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4) + ret void +} + +define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_f64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x 
i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x 
double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x 
double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll new file mode 100644 index 00000000000..80a934700c6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll @@ -0,0 +1,319 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) { +; CHECK-LABEL: test_vld2q_dup_fx_update +; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2 + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer + %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0 + %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <16 x i8>] } %7 +} + +define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld2q_dup_reg_update +; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0 + %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1 + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret { [2 x <4 x i32>] } %8 +} + +define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) { +; CHECK-LABEL: test_vld3_dup_fx_update +; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6 + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 + %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 + %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 + %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer + %9 = 
insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = getelementptr i16* %a, i32 3 + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_reg_update +; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 + %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 + %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer + %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 + %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer + %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0 + %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1 + %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [3 x <8 x i8>] } %10 +} + +define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) { +; CHECK-LABEL: test_vld4_dup_fx_update +; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16 + %1 = bitcast i32* %a to i8* + %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 + %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 + %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 + %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 + %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_dup_reg_update +; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 + %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 + %6 = shufflevector <2 x double> %5, <2 x double> 
undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 + %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 + %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vld2_lane_fx_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld2_lane_reg_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vld3_lane_fx_update +; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <2 x float>] %b, 0 + %2 = extractvalue [3 x <2 x float>] %b, 1 + %3 = extractvalue [3 x <2 x float>] %b, 2 + %4 = bitcast float* %a to i8* + %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) + %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0 + %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1 + %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2 + %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0 + %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1 + %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2 + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret { [3 x <2 x float>] } %11 +} + +define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_lane_reg_update +; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], 
x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0 + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1 + %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2 + %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) { +; CHECK-LABEL: test_vld4_lane_fx_update +; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16 + %1 = extractvalue [4 x <2 x i32>] %b, 0 + %2 = extractvalue [4 x <2 x i32>] %b, 1 + %3 = extractvalue [4 x <2 x i32>] %b, 2 + %4 = extractvalue [4 x <2 x i32>] %b, 3 + %5 = bitcast i32* %a to i8* + %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4) + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0 + %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1 + %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2 + %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3 + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_lane_reg_update +; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x double>] %b, 0 + %2 = extractvalue [4 x <2 x double>] %b, 1 + %3 = extractvalue [4 x <2 x double>] %b, 2 + %4 = extractvalue [4 x <2 x double>] %b, 3 + %5 = bitcast double* %a to i8* + %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0 + %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1 + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2 + %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3 + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define void @test_vst2_lane_fx_update(i8* 
%a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vst2_lane_fx_update +; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret void +} + +define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst2_lane_reg_update +; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %3 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4) + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret void +} + +define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vst3_lane_fx_update +; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <4 x float>] %b, 0 + %2 = extractvalue [3 x <4 x float>] %b, 1 + %3 = extractvalue [3 x <4 x float>] %b, 2 + %4 = bitcast float* %a to i8* + call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4) + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret void +} + +define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst3_lane_reg_update +; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) { +; CHECK-LABEL: test_vst4_lane_fx_update +; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32 + %1 = extractvalue [4 x <2 x double>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x double>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x double>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x double>] %b.coerce, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %tmp1 = getelementptr double* %a, i32 4 + store double* %tmp1, double** %ptr + ret void +} + +define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst4_lane_reg_update +; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x float>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x float>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x float>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x float>] %b.coerce, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4) + %tmp1 = getelementptr float* %a, i32 %inc + store float* %tmp1, float** %ptr + 
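; The getelementptr/store-to-%ptr pairs in these writeback tests are what +; instruction selection folds into the post-indexed addressing mode: an +; increment equal to the total bytes transferred selects the immediate form +; (e.g. #32 for four .d lanes above), any other increment the register form. + 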
ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s index eaa5c562f37..74719e09a53 100644 --- a/test/MC/AArch64/neon-diagnostics.s +++ b/test/MC/AArch64/neon-diagnostics.s @@ -4170,6 +4170,125 @@ // CHECK-ERROR: st4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1 // CHECK-ERROR: ^ +//------------------------------------------------------------------------------ +// Load single N-element structure to all lanes of N consecutive +// registers (N = 1, 2, 3, 4) +//------------------------------------------------------------------------------ + ld1r {x1}, [x0] + ld2r {v31.4s, v0.2s}, [sp] + ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp] +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld1r {x1}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld2r {v31.4s, v0.2s}, [sp] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp] +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Load/Store single N-element structure to/from one lane of N consecutive +// registers (N = 1, 2, 3, 4) +//------------------------------------------------------------------------------ + ld1 {v0.b}[16], [x0] + ld2 {v15.h, v16.h}[8], [x15] + ld3 {v31.s, v0.s, v1.s}[-1], [sp] + ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: ld1 {v0.b}[16], [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: ld2 {v15.h, v16.h}[8], [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected lane number +// CHECK-ERROR: ld3 {v31.s, v0.s, v1.s}[-1], [sp] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: 
lane number incompatible with layout +// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: ^ + + st1 {v0.d}[16], [x0] + st2 {v31.s, v0.s}[3], [8] + st3 {v15.h, v16.h, v17.h}[-1], [x15] + st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: st1 {v0.d}[16], [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v31.s, v0.s}[3], [8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected lane number +// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[-1], [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Post-index of load single N-element structure to all lanes of N consecutive +// registers (N = 1, 2, 3, 4) +//------------------------------------------------------------------------------ + ld1r {v15.8h}, [x15], #5 + ld2r {v0.2d, v1.2d}, [x0], #7 + ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1 + ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld1r {v15.8h}, [x15], #5 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2r {v0.2d, v1.2d}, [x0], #7 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Post-index of Load/Store single N-element structure to/from one lane of N +// consecutive registers (N = 1, 2, 3, 4) +//------------------------------------------------------------------------------ + ld1 {v0.b}[0], [x0], #2 + ld2 {v15.h, v16.h}[0], [x15], #3 + ld3 {v31.s, v0.s, v1.d}[0], [sp], x9 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld1 {v0.b}[0], [x0], #2 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2 {v15.h, v16.h}[0], [x15], #3 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld3 {v31.s, v0.s, v1.d}[0], [sp], x9 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24 +// CHECK-ERROR: ^ + + st1 {v0.d}[0], [x0], #7 + st2 {v31.s, v0.s}[0], [sp], #6 + st3 {v15.h, v16.h, v17.h}[0], [x15], #8 + st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st1 {v0.d}[0], [x0], #7 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v31.s, v0.s}[0], [sp], #6 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[0], [x15], #8 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1 +// CHECK-ERROR: ^ + + ins v2.b[16], w1 ins v7.h[8], w14 ins v20.s[5], w30 diff --git a/test/MC/AArch64/neon-simd-ldst-one-elem.s b/test/MC/AArch64/neon-simd-ldst-one-elem.s new file mode 100644 index 00000000000..140d7525fee --- /dev/null +++ b/test/MC/AArch64/neon-simd-ldst-one-elem.s @@ -0,0 +1,325 @@ +// RUN: llvm-mc 
diff --git a/test/MC/AArch64/neon-simd-ldst-one-elem.s b/test/MC/AArch64/neon-simd-ldst-one-elem.s
new file mode 100644
index 00000000000..140d7525fee
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-ldst-one-elem.s
@@ -0,0 +1,325 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+ ld1r {v0.16b}, [x0]
+ ld1r {v15.8h}, [x15]
+ ld1r {v31.4s}, [sp]
+ ld1r {v0.2d}, [x0]
+ ld1r {v0.8b}, [x0]
+ ld1r {v15.4h}, [x15]
+ ld1r {v31.2s}, [sp]
+ ld1r {v0.1d}, [x0]
+// CHECK: ld1r {v0.16b}, [x0] // encoding: [0x00,0xc0,0x40,0x4d]
+// CHECK: ld1r {v15.8h}, [x15] // encoding: [0xef,0xc5,0x40,0x4d]
+// CHECK: ld1r {v31.4s}, [sp] // encoding: [0xff,0xcb,0x40,0x4d]
+// CHECK: ld1r {v0.2d}, [x0] // encoding: [0x00,0xcc,0x40,0x4d]
+// CHECK: ld1r {v0.8b}, [x0] // encoding: [0x00,0xc0,0x40,0x0d]
+// CHECK: ld1r {v15.4h}, [x15] // encoding: [0xef,0xc5,0x40,0x0d]
+// CHECK: ld1r {v31.2s}, [sp] // encoding: [0xff,0xcb,0x40,0x0d]
+// CHECK: ld1r {v0.1d}, [x0] // encoding: [0x00,0xcc,0x40,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2r {v0.16b, v1.16b}, [x0]
+ ld2r {v15.8h, v16.8h}, [x15]
+ ld2r {v31.4s, v0.4s}, [sp]
+ ld2r {v0.2d, v1.2d}, [x0]
+ ld2r {v0.8b, v1.8b}, [x0]
+ ld2r {v15.4h, v16.4h}, [x15]
+ ld2r {v31.2s, v0.2s}, [sp]
+ ld2r {v31.1d, v0.1d}, [sp]
+// CHECK: ld2r {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xc0,0x60,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xc5,0x60,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xcb,0x60,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xcc,0x60,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xc0,0x60,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xc5,0x60,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xcb,0x60,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [sp] // encoding: [0xff,0xcf,0x60,0x0d]
+
+ ld3r {v0.16b, v1.16b, v2.16b}, [x0]
+ ld3r {v15.8h, v16.8h, v17.8h}, [x15]
+ ld3r {v31.4s, v0.4s, v1.4s}, [sp]
+ ld3r {v0.2d, v1.2d, v2.2d}, [x0]
+ ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+ ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+ ld3r {v31.2s, v0.2s, v1.2s}, [sp]
+ ld3r {v31.1d, v0.1d, v1.1d}, [sp]
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0xe0,0x40,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0xe5,0x40,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0xeb,0x40,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0xec,0x40,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0xe0,0x40,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0xe5,0x40,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0xeb,0x40,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp] // encoding: [0xff,0xef,0x40,0x0d]
+
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+ ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+ ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+ ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+ ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+ ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+ ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0xe0,0x60,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0xe5,0x60,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0xeb,0x60,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0xec,0x60,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0xe0,0x60,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0xe5,0x60,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0xeb,0x60,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] // encoding: [0xff,0xef,0x60,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[9], [x0]
+ ld1 {v15.h}[7], [x15]
+ ld1 {v31.s}[3], [sp]
+ ld1 {v0.d}[1], [x0]
+// CHECK: ld1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x40,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x40,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x40,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x40,0x4d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2 {v0.b, v1.b}[9], [x0]
+ ld2 {v15.h, v16.h}[7], [x15]
+ ld2 {v31.s, v0.s}[3], [sp]
+ ld2 {v0.d, v1.d}[1], [x0]
+// CHECK: ld2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x60,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x60,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x60,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x60,0x4d]
+
+ ld3 {v0.b, v1.b, v2.b}[9], [x0]
+ ld3 {v15.h, v16.h, v17.h}[7], [x15]
+ ld3 {v31.s, v0.s, v1.s}[3], [sp]
+ ld3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x40,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x40,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x40,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x40,0x4d]
+
+ ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+ ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+ ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+ ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x60,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x60,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x60,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x60,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+ st1 {v0.b}[9], [x0]
+ st1 {v15.h}[7], [x15]
+ st1 {v31.s}[3], [sp]
+ st1 {v0.d}[1], [x0]
+// CHECK: st1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x00,0x4d]
+// CHECK: st1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x00,0x4d]
+// CHECK: st1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x00,0x4d]
+// CHECK: st1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x00,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+ st2 {v0.b, v1.b}[9], [x0]
+ st2 {v15.h, v16.h}[7], [x15]
+ st2 {v31.s, v0.s}[3], [sp]
+ st2 {v0.d, v1.d}[1], [x0]
+// CHECK: st2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x20,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x20,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x20,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x20,0x4d]
+
+ st3 {v0.b, v1.b, v2.b}[9], [x0]
+ st3 {v15.h, v16.h, v17.h}[7], [x15]
+ st3 {v31.s, v0.s, v1.s}[3], [sp]
+ st3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x00,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x00,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x00,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x00,0x4d]
+
+ st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+ st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+ st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x20,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x20,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x20,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x20,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index load single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+ ld1r {v0.16b}, [x0], #1
+ ld1r {v15.8h}, [x15], #2
+ ld1r {v31.4s}, [sp], #4
+ ld1r {v0.2d}, [x0], #8
+ ld1r {v0.8b}, [x0], x0
+ ld1r {v15.4h}, [x15], x1
+ ld1r {v31.2s}, [sp], x2
+ ld1r {v0.1d}, [x0], x3
+// CHECK: ld1r {v0.16b}, [x0], #1 // encoding: [0x00,0xc0,0xdf,0x4d]
+// CHECK: ld1r {v15.8h}, [x15], #2 // encoding: [0xef,0xc5,0xdf,0x4d]
+// CHECK: ld1r {v31.4s}, [sp], #4 // encoding: [0xff,0xcb,0xdf,0x4d]
+// CHECK: ld1r {v0.2d}, [x0], #8 // encoding: [0x00,0xcc,0xdf,0x4d]
+// CHECK: ld1r {v0.8b}, [x0], x0 // encoding: [0x00,0xc0,0xc0,0x0d]
+// CHECK: ld1r {v15.4h}, [x15], x1 // encoding: [0xef,0xc5,0xc1,0x0d]
+// CHECK: ld1r {v31.2s}, [sp], x2 // encoding: [0xff,0xcb,0xc2,0x0d]
+// CHECK: ld1r {v0.1d}, [x0], x3 // encoding: [0x00,0xcc,0xc3,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2r {v0.16b, v1.16b}, [x0], #2
+ ld2r {v15.8h, v16.8h}, [x15], #4
+ ld2r {v31.4s, v0.4s}, [sp], #8
+ ld2r {v0.2d, v1.2d}, [x0], #16
+ ld2r {v0.8b, v1.8b}, [x0], x6
+ ld2r {v15.4h, v16.4h}, [x15], x7
+ ld2r {v31.2s, v0.2s}, [sp], x9
+ ld2r {v31.1d, v0.1d}, [x0], x5
+// CHECK: ld2r {v0.16b, v1.16b}, [x0], #2 // encoding: [0x00,0xc0,0xff,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15], #4 // encoding: [0xef,0xc5,0xff,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp], #8 // encoding: [0xff,0xcb,0xff,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0], #16 // encoding: [0x00,0xcc,0xff,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0], x6 // encoding: [0x00,0xc0,0xe6,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15], x7 // encoding: [0xef,0xc5,0xe7,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp], x9 // encoding: [0xff,0xcb,0xe9,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [x0], x5 // encoding: [0x1f,0xcc,0xe5,0x0d]
+
+ ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9
+ ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6
+ ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7
+ ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5
+ ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+ ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+ ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12
+ ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9 // encoding: [0x00,0xe0,0xc9,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6 // encoding: [0xef,0xe5,0xc6,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7 // encoding: [0xff,0xeb,0xc7,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5 // encoding: [0x00,0xec,0xc5,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 // encoding: [0x00,0xe0,0xdf,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 // encoding: [0xef,0xe5,0xdf,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12 // encoding: [0xff,0xeb,0xdf,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24 // encoding: [0xff,0xef,0xdf,0x0d]
+
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4
+ ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8
+ ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16
+ ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5
+ ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9
+ ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+ ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4 // encoding: [0x00,0xe0,0xff,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8 // encoding: [0xef,0xe5,0xff,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16 // encoding: [0xff,0xeb,0xff,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32 // encoding: [0x00,0xec,0xff,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5 // encoding: [0x00,0xe0,0xe5,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9 // encoding: [0xef,0xe5,0xe9,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 // encoding: [0xff,0xeb,0xfe,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 // encoding: [0xff,0xef,0xe7,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[9], [x0], #1
+ ld1 {v15.h}[7], [x15], x9
+ ld1 {v31.s}[3], [sp], x6
+ ld1 {v0.d}[1], [x0], #8
+// CHECK: ld1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0xdf,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0xc9,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0xc6,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0xdf,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to one lane of N
+// consecutive registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2 {v0.b, v1.b}[9], [x0], x3
+ ld2 {v15.h, v16.h}[7], [x15], #4
+ ld2 {v31.s, v0.s}[3], [sp], #8
+ ld2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: ld2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xe3,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xff,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xff,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xe0,0x4d]
+
+ ld3 {v0.b, v1.b, v2.b}[9], [x0], #3
+ ld3 {v15.h, v16.h, v17.h}[7], [x15], #6
+ ld3 {v31.s, v0.s, v1.s}[3], [sp], x3
+ ld3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0xdf,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0xdf,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0xc3,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0xc6,0x4d]
+
+ ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+ ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+ ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+ ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xe5,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xe7,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xff,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xff,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+ st1 {v0.b}[9], [x0], #1
+ st1 {v15.h}[7], [x15], x9
+ st1 {v31.s}[3], [sp], x6
+ st1 {v0.d}[1], [x0], #8
+// CHECK: st1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0x9f,0x4d]
+// CHECK: st1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0x89,0x4d]
+// CHECK: st1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0x86,0x4d]
+// CHECK: st1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0x9f,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single N-element structure from one lane of N
+// consecutive registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ st2 {v0.b, v1.b}[9], [x0], x3
+ st2 {v15.h, v16.h}[7], [x15], #4
+ st2 {v31.s, v0.s}[3], [sp], #8
+ st2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: st2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xa3,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xbf,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xbf,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xa0,0x4d]
+
+ st3 {v0.b, v1.b, v2.b}[9], [x0], #3
+ st3 {v15.h, v16.h, v17.h}[7], [x15], #6
+ st3 {v31.s, v0.s, v1.s}[3], [sp], x3
+ st3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0x9f,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0x9f,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0x83,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0x86,0x4d]
+
+ st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+ st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+ st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xa5,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xa7,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xbf,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xbf,0x4d]
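The stores have the same shape at the IR level; a minimal sketch using one of
the vst lane intrinsics declared earlier (the function name is hypothetical,
and the operand order is again assumed from the declarations):

define void @sketch_vst2lane(i8* %ptr, <2 x i32> %a, <2 x i32> %b) {
  ; Store lane 1 of both vectors, two i32 values at alignment 4; this is the
  ; single-structure form st2 {vA.s, vB.s}[1], [xN]
  call void @llvm.arm.neon.vst2lane.v2i32(i8* %ptr, <2 x i32> %a, <2 x i32> %b, i32 1, i32 4)
  ret void
}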
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
index c1659019a87..9f9e7772857 100644
--- a/test/MC/Disassembler/AArch64/neon-instructions.txt
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -2059,6 +2059,90 @@
 0xef,0x45,0x82,0x4c
 0xff,0x0b,0x9f,0x4c
+#----------------------------------------------------------------------
+# Vector load single N-element structure to all lanes of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0]
+# CHECK: ld1r {v15.8h}, [x15]
+# CHECK: ld2r {v31.4s, v0.4s}, [sp]
+# CHECK: ld2r {v0.2d, v1.2d}, [x0]
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+0x00,0xc0,0x40,0x4d
+0xef,0xc5,0x40,0x4d
+0xff,0xcb,0x60,0x4d
+0x00,0xcc,0x60,0x4d
+0x00,0xe0,0x40,0x0d
+0xef,0xe5,0x40,0x0d
+0xff,0xeb,0x60,0x0d
+0xff,0xef,0x60,0x0d
+
+#----------------------------------------------------------------------
+# Vector load/store single N-element structure to/from one lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[9], [x0]
+# CHECK: ld2 {v15.h, v16.h}[7], [x15]
+# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp]
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+# CHECK: st1 {v0.d}[1], [x0]
+# CHECK: st2 {v31.s, v0.s}[3], [sp]
+# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15]
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+0x00,0x04,0x40,0x4d
+0xef,0x59,0x60,0x4d
+0xff,0xb3,0x40,0x4d
+0x00,0xa4,0x60,0x4d
+0x00,0x84,0x00,0x4d
+0xff,0x93,0x20,0x4d
+0xef,0x79,0x00,0x4d
+0x00,0x24,0x20,0x4d
+
+#----------------------------------------------------------------------
+# Post-index of vector load single N-element structure to all lanes of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0], #1
+# CHECK: ld1r {v15.8h}, [x15], #2
+# CHECK: ld2r {v31.4s, v0.4s}, [sp], #8
+# CHECK: ld2r {v0.2d, v1.2d}, [x0], #16
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+0x00,0xc0,0xdf,0x4d
+0xef,0xc5,0xdf,0x4d
+0xff,0xcb,0xff,0x4d
+0x00,0xcc,0xff,0x4d
+0x00,0xe0,0xdf,0x0d
+0xef,0xe5,0xdf,0x0d
+0xff,0xeb,0xfe,0x0d
+0xff,0xef,0xe7,0x0d
+
+#----------------------------------------------------------------------
+# Post-index of vector load/store single N-element structure to/from
+# one lane of N consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[9], [x0], #1
+# CHECK: ld2 {v15.h, v16.h}[7], [x15], #4
+# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+# CHECK: st1 {v0.d}[1], [x0], #8
+# CHECK: st2 {v31.s, v0.s}[3], [sp], #8
+# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+0x00,0x04,0xdf,0x4d
+0xef,0x59,0xff,0x4d
+0xff,0xb3,0xc3,0x4d
+0x00,0xa4,0xff,0x4d
+0x00,0x84,0x9f,0x4d
+0xff,0x93,0xbf,0x4d
+0xef,0x79,0x9f,0x4d
+0x00,0x24,0xa5,0x4d
+
 #----------------------------------------------------------------------
 # Bitwise extract
 #----------------------------------------------------------------------