diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index ebff3e1c4e0..bdc29402b45 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -444,7 +444,7 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDValue Op, SDValue N,
                                       SDValue &Addr, SDValue &Update,
                                       SDValue &Opc) {
   Addr = N;
-  // The optional writeback is handled in ARMLoadStoreOpt.
+  // Default to no writeback.
   Update = CurDAG->getRegister(0, MVT::i32);
   Opc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(false), MVT::i32);
   return true;
@@ -1388,16 +1388,57 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
     SDValue MemAddr, MemUpdate, MemOpc;
     if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
       return NULL;
+    if (VT.is64BitVector()) {
+      switch (VT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("unhandled vld3 type");
+      case MVT::v8i8:  Opc = ARM::VLD3d8; break;
+      case MVT::v4i16: Opc = ARM::VLD3d16; break;
+      case MVT::v2f32:
+      case MVT::v2i32: Opc = ARM::VLD3d32; break;
+      }
+      SDValue Chain = N->getOperand(0);
+      const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
+      return CurDAG->getMachineNode(Opc, dl, VT, VT, VT, MVT::Other, Ops, 4);
+    }
+    // Quad registers are loaded with two separate instructions, where one
+    // loads the even registers and the other loads the odd registers.
+    EVT RegVT = VT;
+    unsigned Opc2 = 0;
     switch (VT.getSimpleVT().SimpleTy) {
     default: llvm_unreachable("unhandled vld3 type");
-    case MVT::v8i8:  Opc = ARM::VLD3d8; break;
-    case MVT::v4i16: Opc = ARM::VLD3d16; break;
-    case MVT::v2f32:
-    case MVT::v2i32: Opc = ARM::VLD3d32; break;
+    case MVT::v16i8:
+      Opc = ARM::VLD3q8a;  Opc2 = ARM::VLD3q8b;  RegVT = MVT::v8i8;  break;
+    case MVT::v8i16:
+      Opc = ARM::VLD3q16a; Opc2 = ARM::VLD3q16b; RegVT = MVT::v4i16; break;
+    case MVT::v4f32:
+      Opc = ARM::VLD3q32a; Opc2 = ARM::VLD3q32b; RegVT = MVT::v2f32; break;
+    case MVT::v4i32:
+      Opc = ARM::VLD3q32a; Opc2 = ARM::VLD3q32b; RegVT = MVT::v2i32; break;
     }
     SDValue Chain = N->getOperand(0);
-    const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
-    return CurDAG->getMachineNode(Opc, dl, VT, VT, VT, MVT::Other, Ops, 4);
+    // Enable writeback to the address register.
+    MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+    std::vector<EVT> ResTys(3, RegVT);
+    ResTys.push_back(MemAddr.getValueType());
+    ResTys.push_back(MVT::Other);
+
+    const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 4);
+    Chain = SDValue(VLdA, 4);
+
+    const SDValue OpsB[] = { SDValue(VLdA, 3), MemUpdate, MemOpc, Chain };
+    SDNode *VLdB = CurDAG->getMachineNode(Opc2, dl, ResTys, OpsB, 4);
+    Chain = SDValue(VLdB, 4);
+
+    SDNode *Q0 = PairDRegs(VT, SDValue(VLdA, 0), SDValue(VLdB, 0));
+    SDNode *Q1 = PairDRegs(VT, SDValue(VLdA, 1), SDValue(VLdB, 1));
+    SDNode *Q2 = PairDRegs(VT, SDValue(VLdA, 2), SDValue(VLdB, 2));
+    ReplaceUses(SDValue(N, 0), SDValue(Q0, 0));
+    ReplaceUses(SDValue(N, 1), SDValue(Q1, 0));
+    ReplaceUses(SDValue(N, 2), SDValue(Q2, 0));
+    ReplaceUses(SDValue(N, 3), Chain);
+    return NULL;
   }
 
   case Intrinsic::arm_neon_vld4: {
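A note on the quad-register path above: NEON has no single vld3 form that fills Q registers, so the selector emits two double-spaced D-register loads (the "a" variant, then the "b" variant), feeds the first load's writeback result (SDValue(VLdA, 3)) into the second load's address operand, and stitches each even/odd D-register pair into a Q register with PairDRegs. The standalone C++ sketch below models what that instruction pair computes; vld3_d8, the data layout, and every name in it are invented for illustration and are not part of the patch.

#include <array>
#include <cassert>
#include <cstdint>

// Model of one vld3.8 with three D registers: de-interleave 24 bytes of
// 3-byte structures into three 8-byte registers and return the advanced
// pointer, as the writeback form does.
static const uint8_t *vld3_d8(const uint8_t *p, std::array<uint8_t, 8> &r0,
                              std::array<uint8_t, 8> &r1,
                              std::array<uint8_t, 8> &r2) {
  for (int i = 0; i != 8; ++i) {
    r0[i] = *p++;
    r1[i] = *p++;
    r2[i] = *p++;
  }
  return p;
}

int main() {
  // 48 bytes of interleaved structures: a0 b0 c0 a1 b1 c1 ...
  uint8_t mem[48];
  for (int i = 0; i != 48; ++i)
    mem[i] = static_cast<uint8_t>(i);

  // The first load fills the even D registers (low halves of Q0-Q2);
  // the second starts at the written-back address and fills the odd ones.
  std::array<uint8_t, 8> d0, d1, d2, d3, d4, d5;
  const uint8_t *p = vld3_d8(mem, d0, d2, d4); // "a" variant
  vld3_d8(p, d1, d3, d5);                      // "b" variant

  // Pairing D registers (cf. PairDRegs): Q0, the D0/D1 pair, holds all
  // sixteen "a" elements, element 0 of each 3-byte structure.
  for (int i = 0; i != 8; ++i) {
    assert(d0[i] == 3 * i);       // a0..a7
    assert(d1[i] == 3 * (i + 8)); // a8..a15
  }
  return 0;
}

The point of the sketch is that de-interleaving two consecutive 24-byte groups into even and odd register halves is equivalent to one quad-width vld3 over all 48 bytes, which is why pairing SDValue(VLdA, n) with SDValue(VLdB, n) yields the n-th quad-register result.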
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 3f07d302b03..c7ff523b437 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -201,11 +201,26 @@ class VLD3D<string OpcodeStr>
   : NLdSt<(outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
           (ins addrmode6:$addr), IIC_VLD3,
           !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"), "", []>;
+class VLD3WB<string OpcodeStr>
+  : NLdSt<(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$addr), IIC_VLD3,
+          !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"),
+          "$addr.addr = $wb", []>;
 
 def  VLD3d8   : VLD3D<"vld3.8">;
 def  VLD3d16  : VLD3D<"vld3.16">;
 def  VLD3d32  : VLD3D<"vld3.32">;
 
+// vld3 to double-spaced even registers.
+def VLD3q8a   : VLD3WB<"vld3.8">;
+def VLD3q16a  : VLD3WB<"vld3.16">;
+def VLD3q32a  : VLD3WB<"vld3.32">;
+
+// vld3 to double-spaced odd registers.
+def VLD3q8b   : VLD3WB<"vld3.8">;
+def VLD3q16b  : VLD3WB<"vld3.16">;
+def VLD3q32b  : VLD3WB<"vld3.32">;
+
 // VLD4 : Vector Load (multiple 4-element structures)
 class VLD4D<string OpcodeStr>
   : NLdSt<(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
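VLD3WB differs from VLD3D only in the extra GPR:$wb result and the "$addr.addr = $wb" constraint, which ties the written-back address to the base-register sub-operand of addrmode6 so both get the same physical register. The assembly string itself is unchanged; the writeback marker is presumably printed from the addrmode6 operand based on the AM6 opcode flag the selection code sets with getAM6Opc(true). Since each of these loads covers three 8-byte D registers, the writeback advances the base by 24 bytes. A minimal sketch of the resulting address stream, with the base value and all names invented for illustration:

#include <cstdio>

int main() {
  unsigned Base = 0x1000;              // hypothetical starting address
  const unsigned BytesPerVLD3 = 3 * 8; // three D registers, 8 bytes each

  // The first load ("a" variant) reads [Base, Base+24) and yields Base+24
  // as the GPR:$wb result tied by "$addr.addr = $wb".
  unsigned WB = Base + BytesPerVLD3;
  std::printf("first  load at %#x, writeback %#x\n", Base, WB);

  // The second load ("b" variant) consumes that value as its base address.
  std::printf("second load at %#x, writeback %#x\n", WB, WB + BytesPerVLD3);
  return 0;
}

This tied writeback is what lets the selection code above wire SDValue(VLdA, 3) directly into the second load's address operand.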
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
index da1c662edb6..fab62f6cfc2 100644
--- a/lib/Target/ARM/NEONPreAllocPass.cpp
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -36,8 +36,12 @@ namespace {
   char NEONPreAllocPass::ID = 0;
 }
 
-static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd,
-                             unsigned &NumRegs) {
+static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
+                             unsigned &Offset, unsigned &Stride) {
+  // Default to unit stride with no offset.
+  Stride = 1;
+  Offset = 0;
+
   switch (Opcode) {
   default:
     break;
@@ -69,6 +73,24 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd,
     NumRegs = 3;
     return true;
 
+  case ARM::VLD3q8a:
+  case ARM::VLD3q16a:
+  case ARM::VLD3q32a:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD3q8b:
+  case ARM::VLD3q16b:
+  case ARM::VLD3q32b:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
   case ARM::VLD4d8:
   case ARM::VLD4d16:
   case ARM::VLD4d32:
@@ -149,8 +171,8 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
   for (; MBBI != E; ++MBBI) {
     MachineInstr *MI = &*MBBI;
-    unsigned FirstOpnd, NumRegs;
-    if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs))
+    unsigned FirstOpnd, NumRegs, Offset, Stride;
+    if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
       continue;
 
     MachineBasicBlock::iterator NextI = next(MBBI);
@@ -164,9 +186,10 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
       // For now, just assign a fixed set of adjacent registers.
      // This leaves plenty of room for future improvements.
      static const unsigned NEONDRegs[] = {
-        ARM::D0, ARM::D1, ARM::D2, ARM::D3
+        ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+        ARM::D4, ARM::D5, ARM::D6, ARM::D7
      };
-      MO.setReg(NEONDRegs[R]);
+      MO.setReg(NEONDRegs[Offset + R * Stride]);
 
       if (MO.isUse()) {
         // Insert a copy from VirtReg.
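With Offset and Stride available, the pass maps a multi-register NEON operation onto a fixed window of D registers, now eight wide to leave room for the double-spaced forms. The short standalone sketch below prints the assignments that NEONDRegs[Offset + R * Stride] produces; it mirrors the pass's indexing but is otherwise illustrative only.

#include <cstdio>

int main() {
  // The same fixed register window the pass uses.
  const char *NEONDRegs[] = { "D0", "D1", "D2", "D3",
                              "D4", "D5", "D6", "D7" };
  // (Offset, Stride) pairs as returned by isNEONMultiRegOp.
  const struct { const char *Name; unsigned Offset, Stride; } Variants[] = {
    { "VLD3d8 ", 0, 1 }, // plain double-register vld3
    { "VLD3q8a", 0, 2 }, // even (double-spaced) registers
    { "VLD3q8b", 1, 2 }, // odd (double-spaced) registers
  };
  for (const auto &V : Variants) {
    std::printf("%s:", V.Name);
    for (unsigned R = 0; R != 3; ++R) // NumRegs == 3 for vld3
      std::printf(" %s", NEONDRegs[V.Offset + R * V.Stride]);
    std::printf("\n");
  }
  return 0;
}

Running it shows VLD3d8 -> D0 D1 D2, VLD3q8a -> D0 D2 D4, and VLD3q8b -> D1 D3 D5, so the even/odd halves pair into Q0-Q2 exactly as PairDRegs expects.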
diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll
index b03d74ab5dd..4ed53092c3f 100644
--- a/test/CodeGen/ARM/vld3.ll
+++ b/test/CodeGen/ARM/vld3.ll
@@ -5,6 +5,11 @@
 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
 
+%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+
 define <8 x i8> @vld3i8(i8* %A) nounwind {
 ;CHECK: vld3i8:
 ;CHECK: vld3.8
@@ -45,7 +50,56 @@ define <2 x float> @vld3f(float* %A) nounwind {
   ret <2 x float> %tmp4
 }
 
+define <16 x i8> @vld3Qi8(i8* %A) nounwind {
+;CHECK: vld3Qi8:
+;CHECK: vld3.8
+;CHECK: vld3.8
+  %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A)
+  %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
+  %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
+  %tmp4 = add <16 x i8> %tmp2, %tmp3
+  ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vld3Qi16(i16* %A) nounwind {
+;CHECK: vld3Qi16:
+;CHECK: vld3.16
+;CHECK: vld3.16
+  %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i16* %A)
+  %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0
+  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2
+  %tmp4 = add <8 x i16> %tmp2, %tmp3
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vld3Qi32(i32* %A) nounwind {
+;CHECK: vld3Qi32:
+;CHECK: vld3.32
+;CHECK: vld3.32
+  %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i32* %A)
+  %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
+  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
+  %tmp4 = add <4 x i32> %tmp2, %tmp3
+  ret <4 x i32> %tmp4
+}
+
+define <4 x float> @vld3Qf(float* %A) nounwind {
+;CHECK: vld3Qf:
+;CHECK: vld3.32
+;CHECK: vld3.32
+  %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(float* %A)
+  %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0
+  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2
+  %tmp4 = add <4 x float> %tmp2, %tmp3
+  ret <4 x float> %tmp4
+}
+
 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i16*) nounwind readonly
 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i32*) nounwind readonly
 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(float*) nounwind readonly
+
+declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(float*) nounwind readonly