diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1e19eb0c741..f4d13932a51 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -134,7 +134,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
   } else {
-    llvm_unreachable("Unknown register class in copyPhysReg");
+    CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
+    return;
   }
 
   // E.g. ORR xDst, xzr, xSrc, lsl #0
@@ -144,6 +145,58 @@
     .addImm(0);
 }
 
+// Copy a D- or Q-register tuple (DPair/DTriple/DQuad, QPair/QTriple/QQuad)
+// by emitting one vector ORR move per sub-register.  Sub-registers are
+// copied backwards when the first destination sub-register overlaps SrcReg.
+void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        DebugLoc DL, unsigned DestReg,
+                                        unsigned SrcReg) const {
+  unsigned SubRegs;
+  bool IsQRegs;
+  if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = false;
+  } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = false;
+  } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = false;
+  } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = true;
+  } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = true;
+  } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = true;
+  } else
+    llvm_unreachable("Unknown register class");
+
+  unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
+  int Spacing = 1;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  // Copy register tuples backward when the first Dest reg overlaps
+  // with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + (SubRegs - 1);
+    Spacing = -1;
+  }
+
+  // ORR vDst, vSrc, vSrc acts as a plain vector register-to-register move.
+  unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+    BuildMI(MBB, I, DL, get(Opc), Dst)
+        .addReg(Src)
+        .addReg(Src);
+  }
+}
+
 /// Does the Opcode represent a conditional branch that we can remove and re-add
 /// at the end of a basic block?
 static bool isCondBranch(unsigned Opc) {
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 620ecc93b17..946e5da5cf5 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -42,6 +42,9 @@ public:
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const;
+  void CopyPhysRegTuple(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I, DebugLoc DL,
+                        unsigned DestReg, unsigned SrcReg) const;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
new file mode 100644
index 00000000000..4dffcd169e7
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)