diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index f65be1cb179..a78ed2616b0 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -172,6 +172,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, SingleSpc, 4, 8 }, { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q8_UPD, true, true, SingleSpc, 4, 8 }, +{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, SingleSpc, 3, 8}, +{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, SingleSpc, 3, 8}, + { ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, SingleSpc, 3, 4 }, { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, SingleSpc, 3, 4 }, { ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, SingleSpc, 3, 2 }, @@ -946,6 +953,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { case ARM::VLD2DUPd8Pseudo_UPD: case ARM::VLD2DUPd16Pseudo_UPD: case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD3DUPd8Pseudo: + case ARM::VLD3DUPd16Pseudo: + case ARM::VLD3DUPd32Pseudo: + case ARM::VLD3DUPd8Pseudo_UPD: + case ARM::VLD3DUPd16Pseudo_UPD: + case ARM::VLD3DUPd32Pseudo_UPD: ExpandVLD(MBBI); break; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b9fbdc58a1e..a3b86cb9dbd 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2361,6 +2361,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return SelectVLDDup(N, 2, Opcodes); } + case ARMISD::VLD3DUP: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; + return SelectVLDDup(N, 3, Opcodes); + } + case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index fd9b8234123..ebb9d5d8a7f 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -896,6 +896,48 @@ def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo; def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo; // VLD3DUP : Vector Load (single 3-element structure to all lanes) +class VLD3DUP op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD3dup, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; +} + +def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; +def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; +def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; + +def VLD3DUPd8Pseudo : VLDQQPseudo; +def VLD3DUPd16Pseudo : VLDQQPseudo; +def VLD3DUPd32Pseudo : VLDQQPseudo; + +// ...with double-spaced registers (not used for codegen): +def VLD3DUPd8T : VLD3DUP<{0,0,1,?}, "8">; +def VLD3DUPd16T : VLD3DUP<{0,1,1,?}, "16">; +def VLD3DUPd32T : VLD3DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD3DUPWB op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3dupu, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; +} + +def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">; +def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">; +def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">; + +def VLD3DUPd8T_UPD : VLD3DUPWB<{0,0,1,0}, "8">; +def VLD3DUPd16T_UPD : VLD3DUPWB<{0,1,1,?}, "16">; +def VLD3DUPd32T_UPD : VLD3DUPWB<{1,0,1,?}, "32">; + +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo; + // VLD4DUP : Vector Load (single 4-element structure to all lanes) // FIXME: Not yet implemented. } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 5202da46390..6300043114e 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -152,6 +152,8 @@ def IIC_VLD3 : InstrItinClass; def IIC_VLD3ln : InstrItinClass; def IIC_VLD3u : InstrItinClass; def IIC_VLD3lnu : InstrItinClass; +def IIC_VLD3dup : InstrItinClass; +def IIC_VLD3dupu : InstrItinClass; def IIC_VLD4 : InstrItinClass; def IIC_VLD4ln : InstrItinClass; def IIC_VLD4u : InstrItinClass; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index be92562a2e5..1e9ec0791ad 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -559,6 +559,18 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<5, [A8_LSPipe]>], [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>, // + // VLD3dup + InstrItinData, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD3dupu + InstrItinData, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1, 1]>, + // // VLD4 InstrItinData, InstrStage<4, [A8_NLSPipe], 0>, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index d775a9f26fe..a253b988198 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -941,6 +941,24 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<5, [A9_LSUnit]>], [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, // + // VLD3dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, + // // VLD4 InstrItinData, InstrStage<1, [A9_MUX0], 0>, diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll index 00dd134d770..8d78dfbb90b 100644 --- a/test/CodeGen/ARM/vlddup.ll +++ b/test/CodeGen/ARM/vlddup.ll @@ -71,3 +71,23 @@ define <2 x i32> @vld2dupi32(i32* %A) nounwind { declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly + +%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } + +define <4 x i16> @vld3dupi16(i16* %A) nounwind { +;CHECK: vld3dupi16: +;Check the (default) alignment value. VLD3 does not support alignment. +;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0] + %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8) + %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1 + %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2 + %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp7 = add <4 x i16> %tmp2, %tmp4 + %tmp8 = add <4 x i16> %tmp7, %tmp6 + ret <4 x i16> %tmp8 +} + +declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly