From b36ec86c01e3c3238dca621648f017aef96dda60 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Thu, 6 Aug 2009 18:47:44 +0000 Subject: [PATCH] Implement Neon VST[234] operations. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78330 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelDAGToDAG.cpp | 65 +++++++++++++++++++-- lib/Target/ARM/ARMISelLowering.cpp | 25 ++++++++ lib/Target/ARM/ARMISelLowering.h | 5 +- lib/Target/ARM/ARMInstrNEON.td | 91 +++++++++++++++++++++-------- lib/Target/ARM/NEONPreAllocPass.cpp | 21 +++++++ test/CodeGen/ARM/vst2.ll | 38 ++++++++++++ test/CodeGen/ARM/vst3.ll | 38 ++++++++++++ test/CodeGen/ARM/vst4.ll | 38 ++++++++++++ 8 files changed, 290 insertions(+), 31 deletions(-) create mode 100644 test/CodeGen/ARM/vst2.ll create mode 100644 test/CodeGen/ARM/vst3.ll create mode 100644 test/CodeGen/ARM/vst4.ll diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b2c6e12a0f8..d996b24ddf0 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1306,11 +1306,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { } case ARMISD::VLD2D: { - MVT VT = Op.getValueType(); SDValue MemAddr, MemUpdate, MemOpc; if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) return NULL; - unsigned Opc; + unsigned Opc = 0; + MVT VT = Op.getValueType(); switch (VT.getSimpleVT()) { default: llvm_unreachable("unhandled VLD2D type"); case MVT::v8i8: Opc = ARM::VLD2d8; break; @@ -1323,11 +1323,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { } case ARMISD::VLD3D: { - MVT VT = Op.getValueType(); SDValue MemAddr, MemUpdate, MemOpc; if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) return NULL; - unsigned Opc; + unsigned Opc = 0; + MVT VT = Op.getValueType(); switch (VT.getSimpleVT()) { default: llvm_unreachable("unhandled VLD3D type"); case MVT::v8i8: Opc = ARM::VLD3d8; break; @@ -1340,11 +1340,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { 
} case ARMISD::VLD4D: { - MVT VT = Op.getValueType(); SDValue MemAddr, MemUpdate, MemOpc; if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) return NULL; - unsigned Opc; + unsigned Opc = 0; + MVT VT = Op.getValueType(); switch (VT.getSimpleVT()) { default: llvm_unreachable("unhandled VLD4D type"); case MVT::v8i8: Opc = ARM::VLD4d8; break; @@ -1357,6 +1357,59 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { ResTys.push_back(MVT::Other); return CurDAG->getTargetNode(Opc, dl, ResTys, Ops, 3); } + + case ARMISD::VST2D: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) + return NULL; + unsigned Opc = 0; + switch (N->getOperand(2).getValueType().getSimpleVT()) { + default: llvm_unreachable("unhandled VST2D type"); + case MVT::v8i8: Opc = ARM::VST2d8; break; + case MVT::v4i16: Opc = ARM::VST2d16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST2d32; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 5); + } + + case ARMISD::VST3D: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) + return NULL; + unsigned Opc = 0; + switch (N->getOperand(2).getValueType().getSimpleVT()) { + default: llvm_unreachable("unhandled VST3D type"); + case MVT::v8i8: Opc = ARM::VST3d8; break; + case MVT::v4i16: Opc = ARM::VST3d16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST3d32; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(2), N->getOperand(3), + N->getOperand(4) }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 6); + } + + case ARMISD::VST4D: { + SDValue MemAddr, MemUpdate, MemOpc; + if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc)) + return NULL; + unsigned Opc = 0; + switch (N->getOperand(2).getValueType().getSimpleVT()) { + default: 
llvm_unreachable("unhandled VST4D type"); + case MVT::v8i8: Opc = ARM::VST4d8; break; + case MVT::v4i16: Opc = ARM::VST4d16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VST4d32; break; + } + const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, + N->getOperand(2), N->getOperand(3), + N->getOperand(4), N->getOperand(5) }; + return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 7); + } } return SelectCode(Op); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 155ef591f53..0bfe213cf04 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -323,6 +323,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -466,6 +467,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VLD2D: return "ARMISD::VLD2D"; case ARMISD::VLD3D: return "ARMISD::VLD3D"; case ARMISD::VLD4D: return "ARMISD::VLD4D"; + case ARMISD::VST2D: return "ARMISD::VST2D"; + case ARMISD::VST3D: return "ARMISD::VST3D"; + case ARMISD::VST4D: return "ARMISD::VST4D"; } } @@ -1325,6 +1329,23 @@ static SDValue LowerNeonVLDIntrinsic(SDValue Op, SelectionDAG &DAG, return DAG.getNode(Opcode, dl, Node->getVTList(), Ops, 2); } +static SDValue LowerNeonVSTIntrinsic(SDValue Op, SelectionDAG &DAG, + unsigned Opcode, unsigned NumVecs) { + SDNode *Node = Op.getNode(); + MVT VT = Node->getOperand(3).getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + if (!VT.is64BitVector()) + return SDValue(); // unimplemented + + SmallVector<SDValue, 6> Ops; + Ops.push_back(Node->getOperand(0)); + Ops.push_back(Node->getOperand(2)); + for (unsigned N = 0; N < NumVecs; ++N) + 
Ops.push_back(Node->getOperand(N + 3)); + return DAG.getNode(Opcode, dl, MVT::Other, Ops.data(), Ops.size()); +} + SDValue ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -1340,10 +1361,13 @@ ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD4D); case Intrinsic::arm_neon_vst2i: case Intrinsic::arm_neon_vst2f: + return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST2D, 2); case Intrinsic::arm_neon_vst3i: case Intrinsic::arm_neon_vst3f: + return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST3D, 3); case Intrinsic::arm_neon_vst4i: case Intrinsic::arm_neon_vst4f: + return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST4D, 4); default: return SDValue(); // Don't custom lower most intrinsics. } } @@ -2381,6 +2405,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::RETURNADDR: break; case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9393bafdacf..4fe4d8bf943 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -119,7 +119,10 @@ namespace llvm { // Vector load/store with (de)interleaving VLD2D, VLD3D, - VLD4D + VLD4D, + VST2D, + VST3D, + VST4D }; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 2393ed16b8e..2e8e0a294f5 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -81,6 +81,20 @@ def NEONvld3d : SDNode<"ARMISD::VLD3D", SDTARMVLD3, def NEONvld4d : SDNode<"ARMISD::VLD4D", SDTARMVLD4, 
[SDNPHasChain, SDNPMayLoad]>; +def SDTARMVST2 : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>]>; +def SDTARMVST3 : SDTypeProfile<0, 4, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, + SDTCisSameAs<1, 3>]>; +def SDTARMVST4 : SDTypeProfile<0, 5, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, + SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + +def NEONvst2d : SDNode<"ARMISD::VST2D", SDTARMVST2, + [SDNPHasChain, SDNPMayStore]>; +def NEONvst3d : SDNode<"ARMISD::VST3D", SDTARMVST3, + [SDNPHasChain, SDNPMayStore]>; +def NEONvst4d : SDNode<"ARMISD::VST4D", SDTARMVST4, + [SDNPHasChain, SDNPMayStore]>; + //===----------------------------------------------------------------------===// // NEON operand definitions //===----------------------------------------------------------------------===// @@ -172,30 +186,6 @@ def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1i>; def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1f>; def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1i>; -// VST1 : Vector Store (multiple single elements) -class VST1D<string OpcodeStr, ValueType Ty, Intrinsic IntOp> - : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src), - NoItinerary, - !strconcat(OpcodeStr, "\t\\{$src\\}, $addr"), - [(IntOp addrmode6:$addr, (Ty DPR:$src))]>; -class VST1Q<string OpcodeStr, ValueType Ty, Intrinsic IntOp> - : NLdSt<(outs), (ins addrmode6:$addr, QPR:$src), - NoItinerary, - !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), - [(IntOp addrmode6:$addr, (Ty QPR:$src))]>; - -def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>; -def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>; -def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>; -def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>; -def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>; - -def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>; -def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>; -def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>; -def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>; -def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>; - // VLD2 : Vector Load 
(multiple 2-element structures) class VLD2D<string OpcodeStr> : NLdSt<(outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr), @@ -227,6 +217,59 @@ def VLD4d8 : VLD4D<"vld4.8">; def VLD4d16 : VLD4D<"vld4.16">; def VLD4d32 : VLD4D<"vld4.32">; +// VST1 : Vector Store (multiple single elements) +class VST1D<string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src), + NoItinerary, + !strconcat(OpcodeStr, "\t\\{$src\\}, $addr"), + [(IntOp addrmode6:$addr, (Ty DPR:$src))]>; +class VST1Q<string OpcodeStr, ValueType Ty, Intrinsic IntOp> + : NLdSt<(outs), (ins addrmode6:$addr, QPR:$src), + NoItinerary, + !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), + [(IntOp addrmode6:$addr, (Ty QPR:$src))]>; + +def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>; +def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>; +def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>; +def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>; +def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>; + +def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>; +def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>; +def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>; +def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>; +def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>; + +// VST2 : Vector Store (multiple 2-element structures) +class VST2D<string OpcodeStr> + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2), NoItinerary, + !strconcat(OpcodeStr, "\t\\{$src1,$src2\\}, $addr"), []>; + +def VST2d8 : VST2D<"vst2.8">; +def VST2d16 : VST2D<"vst2.16">; +def VST2d32 : VST2D<"vst2.32">; + +// VST3 : Vector Store (multiple 3-element structures) +class VST3D<string OpcodeStr> + : NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), + NoItinerary, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), []>; + +def VST3d8 : VST3D<"vst3.8">; +def VST3d16 : VST3D<"vst3.16">; +def VST3d32 : VST3D<"vst3.32">; + +// VST4 : Vector Store (multiple 4-element structures) +class VST4D<string OpcodeStr> + : NLdSt<(outs), (ins addrmode6:$addr, + DPR:$src1, DPR:$src2, DPR:$src3, 
DPR:$src4), NoItinerary, + !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"), []>; + +def VST4d8 : VST4D<"vst4.8">; +def VST4d16 : VST4D<"vst4.16">; +def VST4d32 : VST4D<"vst4.32">; + //===----------------------------------------------------------------------===// // NEON pattern fragments diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index bf9a72bcd2c..9c790e25250 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -62,6 +62,27 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, FirstOpnd = 0; NumRegs = 4; return true; + + case ARM::VST2d8: + case ARM::VST2d16: + case ARM::VST2d32: + FirstOpnd = 3; + NumRegs = 2; + return true; + + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + FirstOpnd = 3; + NumRegs = 3; + return true; + + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + FirstOpnd = 3; + NumRegs = 4; + return true; } return false; diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll new file mode 100644 index 00000000000..f8f34f4aae3 --- /dev/null +++ b/test/CodeGen/ARM/vst2.ll @@ -0,0 +1,38 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst2i8: +;CHECK: vst2.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst2i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1) + ret void +} + +define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst2i16: +;CHECK: vst2.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst2i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1) + ret void +} + +define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst2i32: +;CHECK: vst2.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst2i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1) + ret void +} + +define void @vst2f(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst2f: +;CHECK: vst2.32 + %tmp1 = load <2 x float>* 
%B + call void @llvm.arm.neon.vst2f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1) + ret void +} + +declare void @llvm.arm.neon.vst2i.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst2i.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst2i.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst2f.v2f32(i8*, <2 x float>, <2 x float>) nounwind diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll new file mode 100644 index 00000000000..c1a6ce86b4c --- /dev/null +++ b/test/CodeGen/ARM/vst3.ll @@ -0,0 +1,38 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst3i8: +;CHECK: vst3.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst3i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) + ret void +} + +define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst3i16: +;CHECK: vst3.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst3i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + ret void +} + +define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst3i32: +;CHECK: vst3.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst3i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + ret void +} + +define void @vst3f(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst3f: +;CHECK: vst3.32 + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst3f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + ret void +} + +declare void @llvm.arm.neon.vst3i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst3i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst3i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst3f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind diff --git a/test/CodeGen/ARM/vst4.ll 
b/test/CodeGen/ARM/vst4.ll new file mode 100644 index 00000000000..1d6f109a728 --- /dev/null +++ b/test/CodeGen/ARM/vst4.ll @@ -0,0 +1,38 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vst4i8: +;CHECK: vst4.8 + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst4i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) + ret void +} + +define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vst4i16: +;CHECK: vst4.16 + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst4i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + ret void +} + +define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vst4i32: +;CHECK: vst4.32 + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst4i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + ret void +} + +define void @vst4f(float* %A, <2 x float>* %B) nounwind { +;CHECK: vst4f: +;CHECK: vst4.32 + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst4f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + ret void +} + +declare void @llvm.arm.neon.vst4i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst4i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst4i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst4f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind