diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index c0478260f51..030f09965cb 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -3686,6 +3686,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: case ARM::VLD1d64TPseudo: + case ARM::VLD1d64TPseudoWB_fixed: case ARM::VLD3d8Pseudo_UPD: case ARM::VLD3d16Pseudo_UPD: case ARM::VLD3d32Pseudo_UPD: @@ -3702,6 +3703,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD4d16Pseudo: case ARM::VLD4d32Pseudo: case ARM::VLD1d64QPseudo: + case ARM::VLD1d64QPseudoWB_fixed: case ARM::VLD4d8Pseudo_UPD: case ARM::VLD4d16Pseudo_UPD: case ARM::VLD4d32Pseudo_UPD: diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index b9594e61283..f2aba620d3a 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -138,7 +138,9 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true}, { ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false}, +{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false}, +{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false}, { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, @@ -1118,6 +1120,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD3d16Pseudo: case ARM::VLD3d32Pseudo: case ARM::VLD1d64TPseudo: + case ARM::VLD1d64TPseudoWB_fixed: case ARM::VLD3d8Pseudo_UPD: case ARM::VLD3d16Pseudo_UPD: case ARM::VLD3d32Pseudo_UPD: @@ -1134,6 +1137,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD4d16Pseudo: case ARM::VLD4d32Pseudo: case ARM::VLD1d64QPseudo: + case ARM::VLD1d64QPseudoWB_fixed: case ARM::VLD4d8Pseudo_UPD: case ARM::VLD4d16Pseudo_UPD: case ARM::VLD4d32Pseudo_UPD: diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 44e9dd15b13..8e236e16fe9 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1668,9 +1668,61 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs, return CurDAG->getTargetConstant(Alignment, MVT::i32); } +static bool isVLDfixed(unsigned Opc) +{ + switch (Opc) { + default: return false; + case ARM::VLD1d8wb_fixed : return true; + case ARM::VLD1d16wb_fixed : return true; + case ARM::VLD1d64Qwb_fixed : return true; + case ARM::VLD1d32wb_fixed : return true; + case ARM::VLD1d64wb_fixed : return true; + case ARM::VLD1d64TPseudoWB_fixed : return true; + case ARM::VLD1d64QPseudoWB_fixed : return true; + case ARM::VLD1q8wb_fixed : return true; + case ARM::VLD1q16wb_fixed : return true; + case ARM::VLD1q32wb_fixed : return true; + case ARM::VLD1q64wb_fixed : return true; + case ARM::VLD2d8wb_fixed : return true; + case ARM::VLD2d16wb_fixed : return true; + case ARM::VLD2d32wb_fixed : return true; + case ARM::VLD2q8PseudoWB_fixed : return true; + case ARM::VLD2q16PseudoWB_fixed : return true; + case ARM::VLD2q32PseudoWB_fixed : return true; + case ARM::VLD2DUPd8wb_fixed : return true; + case ARM::VLD2DUPd16wb_fixed : return true; + case ARM::VLD2DUPd32wb_fixed : return true; + } +} + +static bool isVSTfixed(unsigned Opc) +{ + switch (Opc) { + default: return false; + case ARM::VST1d8wb_fixed : return true; + case ARM::VST1d16wb_fixed : return true; + case ARM::VST1d32wb_fixed : return true; + case ARM::VST1d64wb_fixed : return true; + case ARM::VST1q8wb_fixed : return true; + case ARM::VST1q16wb_fixed : return true; + case ARM::VST1q32wb_fixed : return true; + case ARM::VST1q64wb_fixed : return true; + case ARM::VST1d64TPseudoWB_fixed : return true; + case ARM::VST1d64QPseudoWB_fixed : return true; + case ARM::VST2d8wb_fixed : return true; + case ARM::VST2d16wb_fixed : return true; + case ARM::VST2d32wb_fixed : return true; + case ARM::VST2q8PseudoWB_fixed : return true; + case ARM::VST2q16PseudoWB_fixed : return true; + case ARM::VST2q32PseudoWB_fixed : return true; + } +} + // Get the register stride update opcode of a VLD/VST instruction that // is otherwise equivalent to the given fixed stride updating instruction. static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { + assert((isVLDfixed(Opc) || isVSTfixed(Opc)) + && "Incorrect fixed stride updating instruction."); switch (Opc) { default: break; case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register; @@ -1681,6 +1733,10 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register; case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register; case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register; + case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register; + case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register; + case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register; + case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register; case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register; case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register; @@ -1780,11 +1836,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, SDValue Inc = N->getOperand(AddrOpIdx + 1); // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0 // case entirely when the rest are updated to that form, too. - if ((NumVecs == 1 || NumVecs == 2) && !isa(Inc.getNode())) + if ((NumVecs <= 2) && !isa(Inc.getNode())) Opc = getVLDSTRegisterUpdateOpcode(Opc); - // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so + // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. - if ((NumVecs != 1 && NumVecs != 2 && Opc != ARM::VLD1q64wb_fixed) || + if ((NumVecs > 2 && !isVLDfixed(Opc)) || !isa(Inc.getNode())) Ops.push_back(isa(Inc.getNode()) ? Reg0 : Inc); } @@ -1932,11 +1988,12 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // case entirely when the rest are updated to that form, too. if (NumVecs <= 2 && !isa(Inc.getNode())) Opc = getVLDSTRegisterUpdateOpcode(Opc); - // We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so + // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. - if ((NumVecs > 2 && Opc != ARM::VST1q64wb_fixed) || - !isa(Inc.getNode())) - Ops.push_back(isa(Inc.getNode()) ? Reg0 : Inc); + if (!isa(Inc.getNode())) + Ops.push_back(Inc); + else if (NumVecs > 2 && !isVSTfixed(Opc)) + Ops.push_back(Reg0); } Ops.push_back(SrcReg); Ops.push_back(Pred); @@ -2829,7 +2886,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, ARM::VLD3d32Pseudo_UPD, - ARM::VLD1q64wb_fixed}; + ARM::VLD1d64TPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, ARM::VLD3q16Pseudo_UPD, ARM::VLD3q32Pseudo_UPD }; @@ -2843,7 +2900,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD, - ARM::VLD1q64wb_fixed}; + ARM::VLD1d64QPseudoWB_fixed}; static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, ARM::VLD4q16Pseudo_UPD, ARM::VLD4q32Pseudo_UPD }; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 43bd4c21dc3..0b05c08ed94 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -730,6 +730,8 @@ defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32">; defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64">; def VLD1d64TPseudo : VLDQQPseudo; +def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo; +def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo; // ...with 4 registers class VLD1D4 op7_4, string Dt> @@ -769,6 +771,8 @@ defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32">; defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64">; def VLD1d64QPseudo : VLDQQPseudo; +def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo; +def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo; // VLD2 : Vector Load (multiple 2-element structures) class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -1671,7 +1675,7 @@ defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">; def VST1d64TPseudo : VSTQQPseudo; -def VST1d64TPseudoWB_fixed : VSTQQWBPseudo; +def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo; def VST1d64TPseudoWB_register : VSTQQWBPseudo; // ...with 4 registers @@ -1714,7 +1718,7 @@ defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">; def VST1d64QPseudo : VSTQQPseudo; -def VST1d64QPseudoWB_fixed : VSTQQWBPseudo; +def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo; def VST1d64QPseudoWB_register : VSTQQWBPseudo; // VST2 : Vector Store (multiple 2-element structures) diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll index 400541fb90a..d6eb4c2f6dd 100644 --- a/test/CodeGen/ARM/vld3.ll +++ b/test/CodeGen/ARM/vld3.ll @@ -83,6 +83,19 @@ define <1 x i64> @vld3i64(i64* %A) nounwind { ret <1 x i64> %tmp4 } +define <1 x i64> @vld3i64_update(i64** %ptr, i64* %A) nounwind { +;CHECK-LABEL: vld3i64_update: +;CHECK: vld1.64 {d16, d17, d18}, [r1:64]! + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16) + %tmp5 = getelementptr i64* %A, i32 3 + store i64* %tmp5, i64** %ptr + %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0 + %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2 + %tmp4 = add <1 x i64> %tmp2, %tmp3 + ret <1 x i64> %tmp4 +} + define <16 x i8> @vld3Qi8(i8* %A) nounwind { ;CHECK-LABEL: vld3Qi8: ;Check the alignment value. Max for this instruction is 64 bits: diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll index f7376b503a3..ff162bb022e 100644 --- a/test/CodeGen/ARM/vld4.ll +++ b/test/CodeGen/ARM/vld4.ll @@ -83,6 +83,19 @@ define <1 x i64> @vld4i64(i64* %A) nounwind { ret <1 x i64> %tmp4 } +define <1 x i64> @vld4i64_update(i64** %ptr, i64* %A) nounwind { +;CHECK-LABEL: vld4i64_update: +;CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64) + %tmp5 = getelementptr i64* %A, i32 4 + store i64* %tmp5, i64** %ptr + %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0 + %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2 + %tmp4 = add <1 x i64> %tmp2, %tmp3 + ret <1 x i64> %tmp4 +} + define <16 x i8> @vld4Qi8(i8* %A) nounwind { ;CHECK-LABEL: vld4Qi8: ;Check the alignment value. Max for this instruction is 256 bits: diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll index 91eb7fce2b7..65625de3457 100644 --- a/test/CodeGen/ARM/vst3.ll +++ b/test/CodeGen/ARM/vst3.ll @@ -61,6 +61,18 @@ define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind { ret void } +define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vst3i64_update +;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]! + %A = load i64** %ptr + %tmp0 = bitcast i64* %A to i8* + %tmp1 = load <1 x i64>* %B + call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1) + %tmp2 = getelementptr i64* %A, i32 3 + store i64* %tmp2, i64** %ptr + ret void +} + define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: vst3Qi8: ;Check the alignment value. Max for this instruction is 64 bits: diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll index ef5c83a57db..83a6c704865 100644 --- a/test/CodeGen/ARM/vst4.ll +++ b/test/CodeGen/ARM/vst4.ll @@ -60,6 +60,18 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind { ret void } +define void @vst4i64_update(i64** %ptr, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vst4i64_update: +;CHECK: vst1.64 {d16, d17, d18, d19}, [r1]! + %A = load i64** %ptr + %tmp0 = bitcast i64* %A to i8* + %tmp1 = load <1 x i64>* %B + call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1) + %tmp2 = getelementptr i64* %A, i32 4 + store i64* %tmp2, i64** %ptr + ret void +} + define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: vst4Qi8: ;Check the alignment value. Max for this instruction is 256 bits: