From cdbb3f5d3311e0f46d22bc8daa211b2fab3541cb Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Thu, 27 Aug 2009 01:23:50 +0000 Subject: [PATCH] Fix PR4789. Teach eliminateFrameIndex how to handle VLDRQ and VSTRQ which cannot fold any immediate offset. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@80191 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMBaseInstrInfo.cpp | 21 ++++++---- lib/Target/ARM/ARMBaseInstrInfo.h | 17 ++++---- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 32 ++++++++++----- lib/Target/ARM/Thumb2InstrInfo.cpp | 36 +++++++++------- test/CodeGen/ARM/spill-q.ll | 57 ++++++++++++++++++++++++++ test/CodeGen/Thumb2/thumb2-spill-q.ll | 57 ++++++++++++++++++++++++++ 6 files changed, 179 insertions(+), 41 deletions(-) create mode 100644 test/CodeGen/ARM/spill-q.ll create mode 100644 test/CodeGen/Thumb2/thumb2-spill-q.ll diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index d5ceedb1e86..142c3f1a41c 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -893,9 +893,9 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } -int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - const ARMBaseInstrInfo &TII) { +bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { unsigned Opcode = MI.getOpcode(); const TargetInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); @@ -912,7 +912,8 @@ int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII.get(ARM::MOVr)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.RemoveOperand(FrameRegIdx+1); - return 0; + Offset = 0; + return true; } else if (Offset < 0) { Offset = -Offset; isSub = true; @@ -924,7 +925,8 @@ int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // Replace the FrameIndex with sp / fp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); - return 0; + Offset = 0; + return true; } // Otherwise, pull as much of the immedidate into this ADDri/SUBri @@ -962,7 +964,8 @@ int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, break; } case ARMII::AddrMode4: - break; + // Can't fold any offset even if it's zero. + return false; case ARMII::AddrMode5: { ImmIdx = FrameRegIdx+1; InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); @@ -996,7 +999,8 @@ int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, if (isSub) ImmedOffset |= 1 << NumBits; ImmOp.ChangeToImmediate(ImmedOffset); - return 0; + Offset = 0; + return true; } // Otherwise, it didn't fit. Pull in what we can to simplify the immed. @@ -1008,5 +1012,6 @@ int llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } } - return (isSub) ? -Offset : Offset; + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index f4d1ef373dc..3632450eded 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -317,15 +317,16 @@ void emitT2RegPlusImmediate(MachineBasicBlock &MBB, /// rewriteARMFrameIndex / rewriteT2FrameIndex - -/// Rewrite MI to access 'Offset' bytes from the FP. Return the offset that -/// could not be handled directly in MI. -int rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - const ARMBaseInstrInfo &TII); +/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the +/// offset could not be handled directly in MI, and return the left-over +/// portion by reference. +bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); -int rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - const ARMBaseInstrInfo &TII); +bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); } // End llvm namespace diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 07164ff6f51..b0108f23064 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -1047,19 +1047,24 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } // modify MI as necessary to handle as much of 'Offset' as possible + bool Done = false; if (!AFI->isThumbFunction()) - Offset = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII); + Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII); else { assert(AFI->isThumb2Function()); - Offset = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); + Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); } - if (Offset == 0) + if (Done) return; + const TargetInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is // SP+LargeImm. - assert(Offset && "This code isn't needed if offset already handled!"); + assert((Offset || AddrMode == ARMII::AddrMode4) && + "This code isn't needed if offset already handled!"); // Insert a set of r12 with the full address: r12 = sp + offset // If the offset we have is too large to fit into the instruction, we need @@ -1073,15 +1078,20 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, ARMCC::CondCodes Pred = (PIdx == -1) ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); - if (!AFI->isThumbFunction()) - emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, - Offset, Pred, PredReg, TII); + if (Offset == 0) + // Must be addrmode4. + MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); else { - assert(AFI->isThumb2Function()); - emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, - Offset, Pred, PredReg, TII); + if (!AFI->isThumbFunction()) + emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + else { + assert(AFI->isThumb2Function()); + emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + } + MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); } - MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); } /// Move iterator pass the next bunch of callee save load / store ops for diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index d37f61e490c..8c09ebd3e00 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -319,9 +319,9 @@ immediateOffsetOpcode(unsigned opcode) return 0; } -int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - const ARMBaseInstrInfo &TII) { +bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { unsigned Opcode = MI.getOpcode(); const TargetInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); @@ -340,7 +340,8 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII.get(ARM::tMOVgpr2gpr)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.RemoveOperand(FrameRegIdx+1); - return 0; + Offset = 0; + return true; } if (Offset < 0) { @@ -355,7 +356,8 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, if (ARM_AM::getT2SOImmVal(Offset) != -1) { MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); - return 0; + Offset = 0; + return true; } // Another common case: imm12. if (Offset < 4096) { @@ -365,7 +367,8 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII.get(NewOpc)); MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); - return 0; + Offset = 0; + return true; } // Otherwise, extract 8 adjacent bits from the immediate into this @@ -387,7 +390,7 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); if (OffsetReg != 0) { MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - return Offset; + return Offset == 0; } MI.RemoveOperand(FrameRegIdx+1); @@ -413,11 +416,14 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, NumBits = 12; } } else { - // VFP address modes. - assert(AddrMode == ARMII::AddrMode5); - int InstrOffs=ARM_AM::getAM5Offset(MI.getOperand(FrameRegIdx+1).getImm()); - if (ARM_AM::getAM5Op(MI.getOperand(FrameRegIdx+1).getImm()) ==ARM_AM::sub) - InstrOffs *= -1; + // VFP and NEON address modes. + int InstrOffs = 0; + if (AddrMode == ARMII::AddrMode5) { + const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1); + InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs *= -1; + } NumBits = 8; Scale = 4; Offset += InstrOffs * 4; @@ -448,7 +454,8 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, ImmedOffset = -ImmedOffset; } ImmOp.ChangeToImmediate(ImmedOffset); - return 0; + Offset = 0; + return true; } // Otherwise, offset doesn't fit. Pull in what we can to simplify @@ -468,5 +475,6 @@ int llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset &= ~(Mask*Scale); } - return (isSub) ? -Offset : Offset; + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; } diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll new file mode 100644 index 00000000000..8775e050b2c --- /dev/null +++ b/test/CodeGen/ARM/spill-q.ll @@ -0,0 +1,57 @@ +; RUN: llvm-as < %s | llc -mtriple=armv7-elf -mattr=+neon | FileCheck %s +; PR4789 + +%bar = type { float, float, float } +%baz = type { i32, [16 x %bar], [16 x float], [16 x i32], i8 } +%foo = type { <4 x float> } +%quux = type { i32 (...)**, %baz*, i32 } +%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo } + +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly + +define arm_apcscc void @aaa(%quuz* %this, i8* %block) { +; CHECK: aaa: +; CHECK: vstmia sp +; CHECK: vldmia sp +entry: + %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + store float 6.300000e+01, float* undef, align 4 + %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + store float 0.000000e+00, float* undef, align 4 + %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + %val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1] + br label %bb4 + +bb4: ; preds = %bb193, %entry + %besterror.0.2264 = phi <4 x float> [ undef, %entry ], [ %besterror.0.0, %bb193 ] ; <<4 x float>> [#uses=2] + %part0.0.0261 = phi <4 x float> [ zeroinitializer, %entry ], [ %23, %bb193 ] ; <<4 x float>> [#uses=2] + %3 = fmul <4 x float> zeroinitializer, %0 ; <<4 x float>> [#uses=2] + %4 = fadd <4 x float> %3, %part0.0.0261 ; <<4 x float>> [#uses=1] + %5 = shufflevector <4 x float> %3, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1] + %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> ; <<4 x float>> [#uses=1] + %7 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1] + %8 = fadd <4 x float> %7, ; <<4 x float>> [#uses=1] + %9 = fptosi <4 x float> %8 to <4 x i32> ; <<4 x i32>> [#uses=1] + %10 = sitofp <4 x i32> %9 to <4 x float> ; <<4 x float>> [#uses=1] + %11 = fmul <4 x float> %10, %2 ; <<4 x float>> [#uses=1] + %12 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1] + %13 = fmul <4 x float> %11, %4 ; <<4 x float>> [#uses=1] + %14 = fsub <4 x float> %12, %13 ; <<4 x float>> [#uses=1] + %15 = fsub <4 x float> %14, undef ; <<4 x float>> [#uses=1] + %16 = fmul <4 x float> %15, ; <<4 x float>> [#uses=1] + %17 = fadd <4 x float> %16, undef ; <<4 x float>> [#uses=1] + %18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1] + %19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1] + %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1] + %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2] + %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0] + br i1 undef, label %bb193, label %bb186 + +bb186: ; preds = %bb4 + br label %bb193 + +bb193: ; preds = %bb186, %bb4 + %besterror.0.0 = phi <4 x float> [ %21, %bb186 ], [ %besterror.0.2264, %bb4 ] ; <<4 x float>> [#uses=1] + %23 = fadd <4 x float> %part0.0.0261, zeroinitializer ; <<4 x float>> [#uses=1] + br label %bb4 +} diff --git a/test/CodeGen/Thumb2/thumb2-spill-q.ll b/test/CodeGen/Thumb2/thumb2-spill-q.ll new file mode 100644 index 00000000000..3c8e820ed4a --- /dev/null +++ b/test/CodeGen/Thumb2/thumb2-spill-q.ll @@ -0,0 +1,57 @@ +; RUN: llvm-as < %s | llc -mtriple=thumbv7-elf -mattr=+neon | FileCheck %s +; PR4789 + +%bar = type { float, float, float } +%baz = type { i32, [16 x %bar], [16 x float], [16 x i32], i8 } +%foo = type { <4 x float> } +%quux = type { i32 (...)**, %baz*, i32 } +%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo } + +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly + +define arm_apcscc void @aaa(%quuz* %this, i8* %block) { +; CHECK: aaa: +; CHECK: vstmia sp +; CHECK: vldmia sp +entry: + %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + store float 6.300000e+01, float* undef, align 4 + %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + store float 0.000000e+00, float* undef, align 4 + %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1] + %val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1] + br label %bb4 + +bb4: ; preds = %bb193, %entry + %besterror.0.2264 = phi <4 x float> [ undef, %entry ], [ %besterror.0.0, %bb193 ] ; <<4 x float>> [#uses=2] + %part0.0.0261 = phi <4 x float> [ zeroinitializer, %entry ], [ %23, %bb193 ] ; <<4 x float>> [#uses=2] + %3 = fmul <4 x float> zeroinitializer, %0 ; <<4 x float>> [#uses=2] + %4 = fadd <4 x float> %3, %part0.0.0261 ; <<4 x float>> [#uses=1] + %5 = shufflevector <4 x float> %3, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1] + %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> ; <<4 x float>> [#uses=1] + %7 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1] + %8 = fadd <4 x float> %7, ; <<4 x float>> [#uses=1] + %9 = fptosi <4 x float> %8 to <4 x i32> ; <<4 x i32>> [#uses=1] + %10 = sitofp <4 x i32> %9 to <4 x float> ; <<4 x float>> [#uses=1] + %11 = fmul <4 x float> %10, %2 ; <<4 x float>> [#uses=1] + %12 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1] + %13 = fmul <4 x float> %11, %4 ; <<4 x float>> [#uses=1] + %14 = fsub <4 x float> %12, %13 ; <<4 x float>> [#uses=1] + %15 = fsub <4 x float> %14, undef ; <<4 x float>> [#uses=1] + %16 = fmul <4 x float> %15, ; <<4 x float>> [#uses=1] + %17 = fadd <4 x float> %16, undef ; <<4 x float>> [#uses=1] + %18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1] + %19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1] + %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1] + %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2] + %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0] + br i1 undef, label %bb193, label %bb186 + +bb186: ; preds = %bb4 + br label %bb193 + +bb193: ; preds = %bb186, %bb4 + %besterror.0.0 = phi <4 x float> [ %21, %bb186 ], [ %besterror.0.2264, %bb4 ] ; <<4 x float>> [#uses=1] + %23 = fadd <4 x float> %part0.0.0261, zeroinitializer ; <<4 x float>> [#uses=1] + br label %bb4 +}