diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 180110a84dd..1e19eb0c741 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -418,10 +418,8 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     default:
       llvm_unreachable("Unknown size for regclass");
     }
-  } else {
-    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
-            RC->hasType(MVT::f128))
-           && "Expected integer or floating type for store");
+  } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+             RC->hasType(MVT::f128)) {
     switch (RC->getSize()) {
     case 4: StoreOp = AArch64::LSFP32_STR; break;
     case 8: StoreOp = AArch64::LSFP64_STR; break;
@@ -429,6 +427,22 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     default:
       llvm_unreachable("Unknown size for regclass");
     }
+  } else { // The spill of D tuples is implemented by Q tuples
+    if (RC == &AArch64::QPairRegClass)
+      StoreOp = AArch64::ST1x2_16B;
+    else if (RC == &AArch64::QTripleRegClass)
+      StoreOp = AArch64::ST1x3_16B;
+    else if (RC == &AArch64::QQuadRegClass)
+      StoreOp = AArch64::ST1x4_16B;
+    else
+      llvm_unreachable("Unknown reg class");
+
+    MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
+    // Vector store has different operands from other store instructions.
+    NewMI.addFrameIndex(FrameIdx)
+         .addReg(SrcReg, getKillRegState(isKill))
+         .addMemOperand(MMO);
+    return;
   }

   MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
@@ -464,10 +478,8 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     default:
       llvm_unreachable("Unknown size for regclass");
     }
-  } else {
-    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64)
-            || RC->hasType(MVT::f128))
-           && "Expected integer or floating type for store");
+  } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+             RC->hasType(MVT::f128)) {
     switch (RC->getSize()) {
     case 4: LoadOp = AArch64::LSFP32_LDR; break;
     case 8: LoadOp = AArch64::LSFP64_LDR; break;
@@ -475,6 +487,21 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     default:
       llvm_unreachable("Unknown size for regclass");
     }
+  } else { // The spill of D tuples is implemented by Q tuples
+    if (RC == &AArch64::QPairRegClass)
+      LoadOp = AArch64::LD1x2_16B;
+    else if (RC == &AArch64::QTripleRegClass)
+      LoadOp = AArch64::LD1x3_16B;
+    else if (RC == &AArch64::QQuadRegClass)
+      LoadOp = AArch64::LD1x4_16B;
+    else
+      llvm_unreachable("Unknown reg class");
+
+    MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
+    // Vector load has different operands from other load instructions.
+    NewMI.addFrameIndex(FrameIdx)
+         .addMemOperand(MMO);
+    return;
   }

   MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
@@ -572,6 +599,21 @@ void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
     MinOffset = -0x40 * AccessScale;
     MaxOffset = 0x3f * AccessScale;
     return;
+  case AArch64::LD1x2_16B: case AArch64::ST1x2_16B:
+    AccessScale = 32;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LD1x3_16B: case AArch64::ST1x3_16B:
+    AccessScale = 48;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LD1x4_16B: case AArch64::ST1x4_16B:
+    AccessScale = 64;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
   }
 }
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 75ec44f3fec..618f6fb9289 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -76,6 +76,12 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }

+static bool hasFrameOffset(int opcode) {
+  return opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
+         opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
+         opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
+}
+
 void
 AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
                                          int SPAdj,
@@ -110,8 +116,10 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
   int64_t Offset;
   Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
                                            IsCalleeSaveOp);
-
-  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+  // A vector load/store instruction doesn't have an offset operand.
+  bool HasOffsetOp = hasFrameOffset(MI.getOpcode());
+  if (HasOffsetOp)
+    Offset += MI.getOperand(FIOperandNum + 1).getImm();

   // DBG_VALUE instructions have no real restrictions so they can be handled
   // easily.
@@ -124,7 +132,7 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
   const AArch64InstrInfo &TII =
     *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
   int MinOffset, MaxOffset, OffsetScale;
-  if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) {
+  if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) {
     MinOffset = 0;
     MaxOffset = 0xfff;
     OffsetScale = 1;
@@ -133,10 +141,12 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
     TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
   }

-  // The frame lowering has told us a base and offset it thinks we should use to
-  // access this variable, but it's still up to us to make sure the values are
-  // legal for the instruction in question.
-  if (Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) {
+  // There are two situations in which we can't use frame + offset directly
+  // in the instruction:
+  // (1) The offset can't really be scaled.
+  // (2) The instruction has no offset operand to encode the offset into.
+  if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) ||
+      (!HasOffsetOp && Offset != 0)) {
     unsigned BaseReg =
       MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
     emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
@@ -150,7 +160,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
   assert(Offset >= 0 && "Unexpected negative offset from SP");

   MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
-  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+  if (HasOffsetOp)
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
 }

 unsigned
diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll
new file mode 100644
index 00000000000..9ac2c05ebd0
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-vector-list-spill.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+; FIXME: We should not generate ld/st for such register spill/fill, because the
+; test case is very simple and register pressure is not high. If the
+; spill/fill algorithm is improved, this test case may no longer be triggered,
+; and can then be deleted.
+define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DPairReg:
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0
+  %res = extractelement <2 x i32> %vld.extract, i32 1
+  ret i32 %res
+}
+
+define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DTripleReg:
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
+  %res = extractelement <4 x i16> %vld.extract, i32 1
+  ret i16 %res
+}
+
+define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DQuadReg:
+; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
+  %res = extractelement <4 x i16> %vld.extract, i32 0
+  ret i16 %res
+}
+
+define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QPairReg:
+; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+  %res = extractelement <4 x i32> %vld.extract, i32 1
+  ret i32 %res
+}
+
+define float @spill.QTripleReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QTripleReg:
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
+  %res = extractelement <4 x float> %vld3.extract, i32 1
+  ret float %res
+}
+
+define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QQuadReg:
+; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4)
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0
+  %res = extractelement <16 x i8> %vld.extract, i32 1
+  ret i8 %res
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+
+declare void @foo()
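
Note (not part of the patch): the core of the eliminateFrameIndex change is that the new LD1x*/ST1x* spill/fill instructions carry no immediate-offset operand, so only a zero offset can be folded into them; anything else must be materialized into a scratch base register. The following minimal standalone C++ sketch models that decision. It is not the LLVM API; the helper names hasFrameOffsetOperand and needsScratchRegister are hypothetical, chosen only to mirror the patch's HasOffsetOp test.

// Standalone sketch of the patch's offset-legalization decision.
// All names here are illustrative, not LLVM interfaces.
#include <cstdint>
#include <cstdio>

// LD1x2/3/4_16B and ST1x2/3/4_16B take no immediate-offset operand,
// unlike the ordinary load/store instructions handled earlier.
static bool hasFrameOffsetOperand(bool IsVectorSpill) {
  return !IsVectorSpill;
}

// Mirrors the combined test in eliminateFrameIndex: fall back to a
// scratch register when frame + offset can't be used directly.
static bool needsScratchRegister(int64_t Offset, int OffsetScale,
                                 int MinOffset, int MaxOffset,
                                 bool HasOffsetOp) {
  // (1) The offset can't really be scaled, or is out of range.
  if (Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset)
    return true;
  // (2) There is no offset operand, so only a zero offset can be folded.
  return !HasOffsetOp && Offset != 0;
}

int main() {
  // A Q-pair spill (ST1x2_16B) at offset 32: in range and aligned, but
  // with no offset operand it still needs an ADD into a scratch register.
  bool HasOffsetOp = hasFrameOffsetOperand(/*IsVectorSpill=*/true);
  std::printf("offset 32: needs scratch = %d\n",
              (int)needsScratchRegister(32, 1, 0, 0xfff, HasOffsetOp));
  // A zero offset can use the frame register directly.
  std::printf("offset  0: needs scratch = %d\n",
              (int)needsScratchRegister(0, 1, 0, 0xfff, HasOffsetOp));
  return 0;
}

This is also why the patch routes !HasOffsetOp instructions through the ADDxxi_lsl0_s constraint path (MinOffset 0, MaxOffset 0xfff, scale 1): the bounds there describe the ADD used to build the scratch base, not an encodable load/store offset.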