diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6ea03aa4aa..943952fcf6b 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -788,7 +788,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_8RegisterClass) && "Unknown regclass!"); if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO)); } else { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 0f129eb417f..332ca3cb2a7 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -124,10 +124,10 @@ private: /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. SDNode *SelectDYN_ALLOC(SDNode *N); - /// SelectVLD - Select NEON load intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); @@ -1022,7 +1022,7 @@ static EVT GetNEONSubregVT(EVT VT) { SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1047,6 +1047,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1/VST1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1060,15 +1063,15 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VLD2, - // loading 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector ResTys(4, VT); + std::vector ResTys(2 * NumVecs, RegVT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 4); + Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { @@ -1831,9 +1834,17 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vld2: { unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD2d64 }; + ARM::VLD2d32, ARM::VLD1q64 }; unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 71c7814c9c7..9156ff9e068 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -116,6 +116,7 @@ def h64imm : Operand { //===----------------------------------------------------------------------===// // Use vldmia to load a Q register as a D register pair. +// This is equivalent to VLDMD except that it has a Q register operand. def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, "vldmia", "$addr, ${dst:dregpair}", [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { @@ -126,6 +127,19 @@ def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, let Inst{11-8} = 0b1011; } +let mayLoad = 1 in { +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDRQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. +def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +def VLD1q_UPD + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", "64", + "${dst:dregpair}, $addr$offset", "$addr.addr = $wb", []>; +} // mayLoad = 1 + // Use vstmia to store a Q register as a D register pair. def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, "vstmia", "$addr, ${src:dregpair}", @@ -137,29 +151,27 @@ def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, let Inst{11-8} = 0b1011; } +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { + // VLD1 : Vector Load (multiple single elements) -class VLD1D op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", - [(set DPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; -class VLD1Q op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr", "", - [(set QPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; +class VLD1D op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; -def VLD1d8 : VLD1D<0b0000, "8", v8i8>; -def VLD1d16 : VLD1D<0b0100, "16", v4i16>; -def VLD1d32 : VLD1D<0b1000, "32", v2i32>; -def VLD1df : VLD1D<0b1000, "32", v2f32>; -def VLD1d64 : VLD1D<0b1100, "64", v1i64>; +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; -def VLD1q8 : VLD1Q<0b0000, "8", v16i8>; -def VLD1q16 : VLD1Q<0b0100, "16", v8i16>; -def VLD1q32 : VLD1Q<0b1000, "32", v4i32>; -def VLD1qf : VLD1Q<0b1000, "32", v4f32>; -def VLD1q64 : VLD1Q<0b1100, "64", v2i64>; - -let mayLoad = 1 in { +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; // ...with address register writeback: class VLD1DWB op7_4, string Dt> @@ -182,9 +194,6 @@ def VLD1q8_UPD : VLD1QWB<0b0000, "8">; def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; -} // mayLoad = 1 - -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { // ...with 3 registers (some of these are only for the disassembler): class VLD1D3 op7_4, string Dt> @@ -242,9 +251,6 @@ class VLD2Q op7_4, string Dt> def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; -def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>; def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; @@ -266,11 +272,6 @@ class VLD2QWB op7_4, string Dt> def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; -def VLD2d64_UPD : NLdSt<0,0b10,0b1010,0b1100, - (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index 8f56d7c3df5..ce620fa9dce 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -46,10 +46,17 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + FirstOpnd = 0; + NumRegs = 2; + return true; + case ARM::VLD2d8: case ARM::VLD2d16: case ARM::VLD2d32: - case ARM::VLD2d64: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: