Clean up the handling of the x87 fp stack to make it more robust.

Drop the FpMov instructions, use plain COPY instead.

Drop the FpSET/GET instruction for accessing fixed stack positions.
Instead use normal COPY to/from ST registers around inline assembly, and
provide a single new FpPOP_RETVAL instruction that can access the return
value(s) from a call. This is still necessary since you cannot tell from
the CALL instruction alone if it returns anything on the FP stack. Teach
fast isel to use this.

This provides a much more robust way of handling fixed stack registers -
we can tolerate arbitrary FP stack instructions inserted around calls
and inline assembly. Live range splitting could sometimes break x87 code
by inserting spill code in unfortunate places.

As a bonus we handle floating point inline assembly correctly now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@134018 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Jakob Stoklund Olesen 2011-06-28 18:32:28 +00:00
parent bd35f27ce9
commit 9bbe4d6c00
7 changed files with 535 additions and 231 deletions

View File

@ -1848,16 +1848,18 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// stack, but where we prefer to use the value in xmm registers, copy it
// out as F80 and use a truncate to move it from fp stack reg to xmm reg.
if ((RVLocs[i].getLocReg() == X86::ST0 ||
RVLocs[i].getLocReg() == X86::ST1) &&
isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
CopyVT = MVT::f80;
RVLocs[i].getLocReg() == X86::ST1)) {
if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT()))
CopyVT = MVT::f80;
CopyReg = createResultReg(X86::RFP80RegisterClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
CopyReg);
} else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
CopyReg).addReg(RVLocs[i].getLocReg());
UsedRegs.push_back(RVLocs[i].getLocReg());
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
CopyReg).addReg(RVLocs[i].getLocReg());
UsedRegs.push_back(RVLocs[i].getLocReg());
if (CopyVT != RVLocs[i].getValVT()) {
// Round the F80 the right size, which also moves to the appropriate xmm
// register. This is accomplished by storing the F80 value in memory and

View File

@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@ -134,11 +135,36 @@ namespace {
unsigned Stack[8]; // FP<n> Registers in each stack slot...
unsigned StackTop; // The current top of the FP stack.
enum {
NumFPRegs = 16 // Including scratch pseudo-registers.
};
// For each live FP<n> register, point to its Stack[] entry.
// The first entries correspond to FP0-FP6, the rest are scratch registers
// used when we need slightly different live registers than what the
// register allocator thinks.
unsigned RegMap[16];
unsigned RegMap[NumFPRegs];
// Pending fixed registers - Inline assembly needs FP registers to appear
// in fixed stack slot positions. This is handled by copying FP registers
// to ST registers before the instruction, and copying back after the
// instruction.
//
// This is modeled with pending ST registers. NumPendingSTs is the number
// of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
// register that holds the ST value. The ST registers are not moved into
// place until immediately before the instruction that needs them.
//
// It can happen that we need an ST register to be live when no FP register
// holds the value:
//
// %ST0 = COPY %FP4<kill>
//
// When that happens, we allocate a scratch FP register to hold the ST
// value. That means every register in PendingST must be live.
unsigned NumPendingSTs;
unsigned char PendingST[8];
// Set up our stack model to match the incoming registers to MBB.
void setupBlockStack();
@ -152,13 +178,15 @@ namespace {
dbgs() << " FP" << Stack[i];
assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
}
for (unsigned i = 0; i != NumPendingSTs; ++i)
dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
dbgs() << "\n";
}
/// getSlot - Return the stack slot number a particular register number is
/// in.
unsigned getSlot(unsigned RegNo) const {
assert(RegNo < array_lengthof(RegMap) && "Regno out of range!");
assert(RegNo < NumFPRegs && "Regno out of range!");
return RegMap[RegNo];
}
@ -170,12 +198,17 @@ namespace {
/// getScratchReg - Return an FP register that is not currently in use.
unsigned getScratchReg() {
for (int i = array_lengthof(RegMap) - 1; i >= 8; --i)
for (int i = NumFPRegs - 1; i >= 8; --i)
if (!isLive(i))
return i;
llvm_unreachable("Ran out of scratch FP registers");
}
/// isScratchReg - Returns trus if RegNo is a scratch FP register.
bool isScratchReg(unsigned RegNo) {
return RegNo > 8 && RegNo < NumFPRegs;
}
/// getStackEntry - Return the X86::FP<n> register in register ST(i).
unsigned getStackEntry(unsigned STi) const {
if (STi >= StackTop)
@ -191,7 +224,7 @@ namespace {
// pushReg - Push the specified FP<n> register onto the stack.
void pushReg(unsigned Reg) {
assert(Reg < array_lengthof(RegMap) && "Register number out of range!");
assert(Reg < NumFPRegs && "Register number out of range!");
if (StackTop >= 8)
report_fatal_error("Stack overflow!");
Stack[StackTop] = Reg;
@ -261,7 +294,14 @@ namespace {
void handleCondMovFP(MachineBasicBlock::iterator &I);
void handleSpecialFP(MachineBasicBlock::iterator &I);
bool translateCopy(MachineInstr*);
// Check if a COPY instruction is using FP registers.
bool isFPCopy(MachineInstr *MI) {
unsigned DstReg = MI->getOperand(0).getReg();
unsigned SrcReg = MI->getOperand(1).getReg();
return X86::RFP80RegClass.contains(DstReg) ||
X86::RFP80RegClass.contains(SrcReg);
}
};
char FPS::ID = 0;
}
@ -351,6 +391,7 @@ void FPS::bundleCFG(MachineFunction &MF) {
bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
bool Changed = false;
MBB = &BB;
NumPendingSTs = 0;
setupBlockStack();
@ -362,7 +403,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
if (MI->isInlineAsm())
FPInstClass = X86II::SpecialFP;
if (MI->isCopy() && translateCopy(MI))
if (MI->isCopy() && isFPCopy(MI))
FPInstClass = X86II::SpecialFP;
if (FPInstClass == X86II::NotFP)
@ -891,7 +932,8 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
continue;
// (Reg st0) (OldReg st0) = (Reg OldReg st0)
moveToTop(Reg, I);
moveToTop(OldReg, I);
if (FixCount > 0)
moveToTop(OldReg, I);
}
DEBUG(dumpStack());
}
@ -1249,142 +1291,309 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
MachineInstr *MI = I;
switch (MI->getOpcode()) {
default: llvm_unreachable("Unknown SpecialFP instruction!");
case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
assert(StackTop == 0 && "Stack should be empty after a call!");
pushReg(getFPReg(MI->getOperand(0)));
break;
case X86::FpGET_ST1_32:// Appears immediately after a call returning FP type!
case X86::FpGET_ST1_64:// Appears immediately after a call returning FP type!
case X86::FpGET_ST1_80:{// Appears immediately after a call returning FP type!
// FpGET_ST1 should occur right after a FpGET_ST0 for a call or inline asm.
// The pattern we expect is:
// CALL
// FP1 = FpGET_ST0
// FP4 = FpGET_ST1
//
// At this point, we've pushed FP1 on the top of stack, so it should be
// present if it isn't dead. If it was dead, we already emitted a pop to
// remove it from the stack and StackTop = 0.
// Push FP4 as top of stack next.
pushReg(getFPReg(MI->getOperand(0)));
// If StackTop was 0 before we pushed our operand, then ST(0) must have been
// dead. In this case, the ST(1) value is the only thing that is live, so
// it should be on the TOS (after the pop that was emitted) and is. Just
// continue in this case.
if (StackTop == 1)
break;
// Because pushReg just pushed ST(1) as TOS, we now have to swap the two top
// elements so that our accounting is correct.
unsigned RegOnTop = getStackEntry(0);
unsigned RegNo = getStackEntry(1);
// Swap the slots the regs are in.
std::swap(RegMap[RegNo], RegMap[RegOnTop]);
// Swap stack slot contents.
if (RegMap[RegOnTop] >= StackTop)
report_fatal_error("Access past stack top!");
std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
break;
}
case X86::FpSET_ST0_32:
case X86::FpSET_ST0_64:
case X86::FpSET_ST0_80: {
// FpSET_ST0_80 is generated by copyRegToReg for setting up inline asm
// arguments that use an st constraint. We expect a sequence of
// instructions: Fp_SET_ST0 Fp_SET_ST1? INLINEASM
unsigned Op0 = getFPReg(MI->getOperand(0));
if (!MI->killsRegister(X86::FP0 + Op0)) {
// Duplicate Op0 into a temporary on the stack top.
duplicateToTop(Op0, getScratchReg(), I);
} else {
// Op0 is killed, so just swap it into position.
moveToTop(Op0, I);
}
--StackTop; // "Forget" we have something on the top of stack!
break;
}
case X86::FpSET_ST1_32:
case X86::FpSET_ST1_64:
case X86::FpSET_ST1_80: {
// Set up st(1) for inline asm. We are assuming that st(0) has already been
// set up by FpSET_ST0, and our StackTop is off by one because of it.
unsigned Op0 = getFPReg(MI->getOperand(0));
// Restore the actual StackTop from before Fp_SET_ST0.
// Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we
// are not enforcing the constraint.
++StackTop;
unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0).
if (!MI->killsRegister(X86::FP0 + Op0)) {
duplicateToTop(Op0, getScratchReg(), I);
moveToTop(RegOnTop, I);
} else if (getSTReg(Op0) != X86::ST1) {
// We have the wrong value at st(1). Shuffle! Untested!
moveToTop(getStackEntry(1), I);
moveToTop(Op0, I);
moveToTop(RegOnTop, I);
}
assert(StackTop >= 2 && "Too few live registers");
StackTop -= 2; // "Forget" both st(0) and st(1).
break;
}
case X86::MOV_Fp3232:
case X86::MOV_Fp3264:
case X86::MOV_Fp6432:
case X86::MOV_Fp6464:
case X86::MOV_Fp3280:
case X86::MOV_Fp6480:
case X86::MOV_Fp8032:
case X86::MOV_Fp8064:
case X86::MOV_Fp8080: {
case TargetOpcode::COPY: {
// We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
const MachineOperand &MO1 = MI->getOperand(1);
unsigned SrcReg = getFPReg(MO1);
const MachineOperand &MO0 = MI->getOperand(0);
unsigned DestReg = getFPReg(MO0);
if (MI->killsRegister(X86::FP0+SrcReg)) {
unsigned DstST = MO0.getReg() - X86::ST0;
unsigned SrcST = MO1.getReg() - X86::ST0;
bool KillsSrc = MI->killsRegister(MO1.getReg());
// ST = COPY FP. Set up a pending ST register.
if (DstST < 8) {
unsigned SrcFP = getFPReg(MO1);
assert(isLive(SrcFP) && "Cannot copy dead register");
assert(!MO0.isDead() && "Cannot copy to dead ST register");
// Unallocated STs are marked as the nonexistent FP255.
while (NumPendingSTs <= DstST)
PendingST[NumPendingSTs++] = NumFPRegs;
// STi could still be live from a previous inline asm.
if (isScratchReg(PendingST[DstST])) {
DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
<< '\n');
freeStackSlotBefore(MI, PendingST[DstST]);
}
// When the source is killed, allocate a scratch FP register.
if (KillsSrc) {
unsigned Slot = getSlot(SrcFP);
unsigned SR = getScratchReg();
PendingST[DstST] = SR;
Stack[Slot] = SR;
RegMap[SR] = Slot;
} else
PendingST[DstST] = SrcFP;
break;
}
// FP = COPY ST. Extract fixed stack value.
// Any instruction defining ST registers must have assigned them to a
// scratch register.
if (SrcST < 8) {
unsigned DstFP = getFPReg(MO0);
assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
unsigned SrcFP = PendingST[SrcST];
assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
assert(isLive(SrcFP) && "Scratch holding ST is dead");
// DstFP steals the stack slot from SrcFP.
unsigned Slot = getSlot(SrcFP);
Stack[Slot] = DstFP;
RegMap[DstFP] = Slot;
// Always treat the ST as killed.
PendingST[SrcST] = NumFPRegs;
while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
--NumPendingSTs;
break;
}
// FP <- FP copy.
unsigned DstFP = getFPReg(MO0);
unsigned SrcFP = getFPReg(MO1);
assert(isLive(SrcFP) && "Cannot copy dead register");
if (KillsSrc) {
// If the input operand is killed, we can just change the owner of the
// incoming stack slot into the result.
unsigned Slot = getSlot(SrcReg);
assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
Stack[Slot] = DestReg;
RegMap[DestReg] = Slot;
unsigned Slot = getSlot(SrcFP);
Stack[Slot] = DstFP;
RegMap[DstFP] = Slot;
} else {
// For FMOV we just duplicate the specified value to a new stack slot.
// For COPY we just duplicate the specified value to a new stack slot.
// This could be made better, but would require substantial changes.
duplicateToTop(SrcReg, DestReg, I);
}
duplicateToTop(SrcFP, DstFP, I);
}
break;
}
case X86::FpPOP_RETVAL: {
// The FpPOP_RETVAL instruction is used after calls that return a value on
// the floating point stack. We cannot model this with ST defs since CALL
// instructions have fixed clobber lists. This instruction is interpreted
// to mean that there is one more live register on the stack than we
// thought.
//
// This means that StackTop does not match the hardware stack between a
// call and the FpPOP_RETVAL instructions. We do tolerate FP instructions
// between CALL and FpPOP_RETVAL as long as they don't overflow the
// hardware stack.
unsigned DstFP = getFPReg(MI->getOperand(0));
// Move existing stack elements up to reflect reality.
assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
if (StackTop) {
std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
for (unsigned i = 0; i != NumFPRegs; ++i)
++RegMap[i];
}
++StackTop;
// DstFP is the new bottom of the stack.
Stack[0] = DstFP;
RegMap[DstFP] = 0;
// DstFP will be killed by processBasicBlock if this was a dead def.
break;
}
case TargetOpcode::INLINEASM: {
// The inline asm MachineInstr currently only *uses* FP registers for the
// 'f' constraint. These should be turned into the current ST(x) register
// in the machine instr. Also, any kills should be explicitly popped after
// the inline asm.
unsigned Kills = 0;
// in the machine instr.
//
// There are special rules for x87 inline assembly. The compiler must know
// exactly how many registers are popped and pushed implicitly by the asm.
// Otherwise it is not possible to restore the stack state after the inline
// asm.
//
// There are 3 kinds of input operands:
//
// 1. Popped inputs. These must appear at the stack top in ST0-STn. A
// popped input operand must be in a fixed stack slot, and it is either
// tied to an output operand, or in the clobber list. The MI has ST use
// and def operands for these inputs.
//
// 2. Fixed inputs. These inputs appear in fixed stack slots, but are
// preserved by the inline asm. The fixed stack slots must be STn-STm
// following the popped inputs. A fixed input operand cannot be tied to
// an output or appear in the clobber list. The MI has ST use operands
// and no defs for these inputs.
//
// 3. Preserved inputs. These inputs use the "f" constraint which is
// represented as an FP register. The inline asm won't change these
// stack slots.
//
// Outputs must be in ST registers, FP outputs are not allowed. Clobbered
// registers do not count as output operands. The inline asm changes the
// stack as if it popped all the popped inputs and then pushed all the
// output operands.
// Scan the assembly for ST registers used, defined and clobbered. We can
// only tell clobbers from defs by looking at the asm descriptor.
unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
unsigned NumOps = 0;
for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
unsigned Flags = MI->getOperand(i).getImm();
NumOps = InlineAsm::getNumOperandRegisters(Flags);
if (NumOps != 1)
continue;
const MachineOperand &MO = MI->getOperand(i + 1);
if (!MO.isReg())
continue;
unsigned STReg = MO.getReg() - X86::ST0;
if (STReg >= 8)
continue;
switch (InlineAsm::getKind(Flags)) {
case InlineAsm::Kind_RegUse:
STUses |= (1u << STReg);
break;
case InlineAsm::Kind_RegDef:
case InlineAsm::Kind_RegDefEarlyClobber:
STDefs |= (1u << STReg);
if (MO.isDead())
STDeadDefs |= (1u << STReg);
break;
case InlineAsm::Kind_Clobber:
STClobbers |= (1u << STReg);
break;
default:
break;
}
}
if (STUses && !isMask_32(STUses))
report_fatal_error("Inline asm fixed inputs"
" must be last on the x87 stack");
unsigned NumSTUses = CountTrailingOnes_32(STUses);
// Defs must be contiguous from the stack top. ST0-STn.
if (STDefs && !isMask_32(STDefs))
report_fatal_error("Inline asm fixed outputs"
" must be last on the x87 stack");
unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
// So must the clobbered stack slots. ST0-STm, m >= n.
if (STClobbers && !isMask_32(STDefs | STClobbers))
report_fatal_error("Inline asm clobbers must be last on the x87 stack");
// Popped inputs are the ones that are also clobbered or defined.
unsigned STPopped = STUses & (STDefs | STClobbers);
if (STPopped && !isMask_32(STPopped))
report_fatal_error("Inline asm popped inputs"
" must be last on the x87 stack");
unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
<< NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
// Scan the instruction for FP uses corresponding to "f" constraints.
// Collect FP registers to kill afer the instruction.
// Always kill all the scratch regs.
unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
unsigned FPUsed = 0;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &Op = MI->getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
assert(Op.isUse() && "Only handle inline asm uses right now");
if (!Op.isUse())
report_fatal_error("Illegal \"f\" output constraint in inline asm");
unsigned FPReg = getFPReg(Op);
Op.setReg(getSTReg(FPReg));
FPUsed |= 1U << FPReg;
// If we kill this operand, make sure to pop it from the stack after the
// asm. We just remember it for now, and pop them all off at the end in
// a batch.
if (Op.isKill())
Kills |= 1U << FPReg;
FPKills |= 1U << FPReg;
}
// The popped inputs will be killed by the instruction, so duplicate them
// if the FP register needs to be live after the instruction, or if it is
// used in the instruction itself. We effectively treat the popped inputs
// as early clobbers.
for (unsigned i = 0; i < NumSTPopped; ++i) {
if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
continue;
unsigned SR = getScratchReg();
duplicateToTop(PendingST[i], SR, I);
DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
<< unsigned(PendingST[i]) << " to avoid clobbering it.\n");
PendingST[i] = SR;
}
// Make sure we have a unique live register for every fixed use. Some of
// them could be undef uses, and we need to emit LD_F0 instructions.
for (unsigned i = 0; i < NumSTUses; ++i) {
if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
// Check for shared assignments.
for (unsigned j = 0; j < i; ++j) {
if (PendingST[j] != PendingST[i])
continue;
// STi and STj are inn the same register, create a copy.
unsigned SR = getScratchReg();
duplicateToTop(PendingST[i], SR, I);
DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
<< unsigned(PendingST[i])
<< " to avoid collision with ST" << j << '\n');
PendingST[i] = SR;
}
continue;
}
unsigned SR = getScratchReg();
DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
pushReg(SR);
PendingST[i] = SR;
if (NumPendingSTs == i)
++NumPendingSTs;
}
assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
// Now we can rearrange the live registers to match what was requested.
shuffleStackTop(PendingST, NumPendingSTs, I);
DEBUG({dbgs() << "Before asm: "; dumpStack();});
// With the stack layout fixed, rewrite the FP registers.
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &Op = MI->getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
unsigned FPReg = getFPReg(Op);
Op.setReg(getSTReg(FPReg));
}
// Simulate the inline asm popping its inputs and pushing its outputs.
StackTop -= NumSTPopped;
// Hold the fixed output registers in scratch FP registers. They will be
// transferred to real FP registers by copies.
NumPendingSTs = 0;
for (unsigned i = 0; i < NumSTDefs; ++i) {
unsigned SR = getScratchReg();
pushReg(SR);
FPKills &= ~(1u << SR);
}
for (unsigned i = 0; i < NumSTDefs; ++i)
PendingST[NumPendingSTs++] = getStackEntry(i);
DEBUG({dbgs() << "After asm: "; dumpStack();});
// If any of the ST defs were dead, pop them immediately. Our caller only
// handles dead FP defs.
MachineBasicBlock::iterator InsertPt = MI;
for (unsigned i = 0; STDefs & (1u << i); ++i) {
if (!(STDeadDefs & (1u << i)))
continue;
freeStackSlotAfter(InsertPt, PendingST[i]);
PendingST[i] = NumFPRegs;
}
while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
--NumPendingSTs;
// If this asm kills any FP registers (is the last use of them) we must
// explicitly emit pop instructions for them. Do this now after the asm has
// executed so that the ST(x) numbers are not off (which would happen if we
@ -1392,16 +1601,16 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
//
// Note: this might be a non-optimal pop sequence. We might be able to do
// better by trying to pop in stack order or something.
MachineBasicBlock::iterator InsertPt = MI;
while (Kills) {
unsigned FPReg = CountTrailingZeros_32(Kills);
freeStackSlotAfter(InsertPt, FPReg);
Kills &= ~(1U << FPReg);
while (FPKills) {
unsigned FPReg = CountTrailingZeros_32(FPKills);
if (isLive(FPReg))
freeStackSlotAfter(InsertPt, FPReg);
FPKills &= ~(1U << FPReg);
}
// Don't delete the inline asm!
return;
}
case X86::RET:
case X86::RETI:
// If RET has an FP register use operand, pass the first one in ST(0) and
@ -1499,33 +1708,3 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
} else
--I;
}
// Translate a COPY instruction to a pseudo-op that handleSpecialFP understands.
bool FPS::translateCopy(MachineInstr *MI) {
unsigned DstReg = MI->getOperand(0).getReg();
unsigned SrcReg = MI->getOperand(1).getReg();
if (DstReg == X86::ST0) {
MI->setDesc(TII->get(X86::FpSET_ST0_80));
MI->RemoveOperand(0);
return true;
}
if (DstReg == X86::ST1) {
MI->setDesc(TII->get(X86::FpSET_ST1_80));
MI->RemoveOperand(0);
return true;
}
if (SrcReg == X86::ST0) {
MI->setDesc(TII->get(X86::FpGET_ST0_80));
return true;
}
if (SrcReg == X86::ST1) {
MI->setDesc(TII->get(X86::FpGET_ST1_80));
return true;
}
if (X86::RFP80RegClass.contains(DstReg, SrcReg)) {
MI->setDesc(TII->get(X86::MOV_Fp8080));
return true;
}
return false;
}

View File

@ -1511,20 +1511,15 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If this is a call to a function that returns an fp value on the floating
// point stack, we must guarantee the the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpGET_ST0 instructions
// if the return value is not used. We use the FpPOP_RETVAL instruction
// instead.
if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
bool isST0 = VA.getLocReg() == X86::ST0;
unsigned Opc = 0;
if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
SDValue Ops[] = { Chain, InFlag };
Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue,
Ops, 2), 1);
Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
MVT::Other, MVT::Glue, Ops, 2), 1);
Val = Chain.getValue(0);
// Round the f80 to the right size, which also moves it to the appropriate

View File

@ -112,31 +112,8 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection.
// a pattern) and the FPI instruction should have emission info (e.g. opcode
// encoding and asm printing info).
// Pseudo Instructions for FP stack return values.
def FpGET_ST0_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
def FpGET_ST0_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
def FpGET_ST0_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
// FpGET_ST1* should only be issued *after* an FpGET_ST0* has been issued when
// there are two values live out on the stack from a call or inlineasm. This
// magic is handled by the stackifier. It is not valid to emit FpGET_ST1* and
// then FpGET_ST0*. In addition, it is invalid for any FP-using operations to
// occur between them.
def FpGET_ST1_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
def FpGET_ST1_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
def FpGET_ST1_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
let Defs = [ST0] in {
def FpSET_ST0_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(0) = FPR
def FpSET_ST0_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(0) = FPR
def FpSET_ST0_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(0) = FPR
}
let Defs = [ST1] in {
def FpSET_ST1_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(1) = FPR
def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR
def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR
}
// Pseudo Instruction for FP stack return values.
def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
@ -147,19 +124,6 @@ class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
// Register copies. Just copies, the shortening ones do not truncate.
let neverHasSideEffects = 1 in {
def MOV_Fp3232 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
def MOV_Fp3264 : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
def MOV_Fp6432 : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
def MOV_Fp6464 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
def MOV_Fp8032 : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
def MOV_Fp3280 : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
def MOV_Fp8064 : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
def MOV_Fp6480 : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
def MOV_Fp8080 : FpI_ <(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>;
}
// Factoring for arithmetic.
multiclass FPBinary_rr<SDNode OpNode> {
// Register op register -> register

View File

@ -500,18 +500,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::BPL);
}
// Mark the x87 stack registers as reserved, since they don't behave normally
// with respect to liveness. We don't fully model the effects of x87 stack
// pushes and pops after stackification.
Reserved.set(X86::ST0);
Reserved.set(X86::ST1);
Reserved.set(X86::ST2);
Reserved.set(X86::ST3);
Reserved.set(X86::ST4);
Reserved.set(X86::ST5);
Reserved.set(X86::ST6);
Reserved.set(X86::ST7);
// Mark the segment registers as reserved.
Reserved.set(X86::CS);
Reserved.set(X86::SS);

View File

@ -206,15 +206,22 @@ let Namespace = "X86" in {
def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
}
// Floating point stack registers
def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
def ST1 : Register<"st(1)">, DwarfRegNum<[34, 13, 12]>;
def ST2 : Register<"st(2)">, DwarfRegNum<[35, 14, 13]>;
def ST3 : Register<"st(3)">, DwarfRegNum<[36, 15, 14]>;
def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>;
def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>;
def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>;
def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>;
class STRegister<string Name, list<Register> A> : Register<Name> {
let Aliases = A;
}
// Floating point stack registers. These don't map one-to-one to the FP
// pseudo registers, but we still mark them as aliasing FP registers. That
// way both kinds can be live without exceeding the stack depth. ST registers
// are only live around inline assembly.
def ST0 : STRegister<"st(0)", []>, DwarfRegNum<[33, 12, 11]>;
def ST1 : STRegister<"st(1)", [FP6]>, DwarfRegNum<[34, 13, 12]>;
def ST2 : STRegister<"st(2)", [FP5]>, DwarfRegNum<[35, 14, 13]>;
def ST3 : STRegister<"st(3)", [FP4]>, DwarfRegNum<[36, 15, 14]>;
def ST4 : STRegister<"st(4)", [FP3]>, DwarfRegNum<[37, 16, 15]>;
def ST5 : STRegister<"st(5)", [FP2]>, DwarfRegNum<[38, 17, 16]>;
def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>;
def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>;
// Status flags register
def EFLAGS : Register<"flags">;

View File

@ -106,6 +106,25 @@ return:
ret void
}
; Passing a non-killed value through asm in {st}.
; Make sure it is not duped before.
; Second asm kills st(0), so we shouldn't pop anything
; CHECK: testPR4185b
; CHECK-NOT: fld %st(0)
; CHECK: fistl
; CHECK-NOT: fstp
; CHECK: fistpl
; CHECK-NOT: fstp
; CHECK: ret
; A valid alternative would be to remat the constant pool load before each
; inline asm.
define void @testPR4185b() {
return:
call void asm sideeffect "fistl $0", "{st}"(double 1.000000e+06)
call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
ret void
}
; PR4459
; The return value from ceil must be duped before being consumed by asm.
; CHECK: testPR4459
@ -160,3 +179,153 @@ entry:
tail call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %5)
ret void
}
; An input argument in a fixed position is implicitly popped by the asm only if
; the input argument is tied to an output register, or it is in the clobber list.
; The clobber list case is tested above.
;
; This doesn't implicitly pop the stack:
;
; void fist1(long double x, int *p) {
; asm volatile ("fistl %1" : : "t"(x), "m"(*p));
; }
;
; CHECK: fist1
; CHECK: fldt
; CHECK: fistl (%e
; CHECK: fstp
; CHECK: ret
define void @fist1(x86_fp80 %x, i32* %p) nounwind ssp {
entry:
tail call void asm sideeffect "fistl $1", "{st},*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, i32* %p) nounwind
ret void
}
; Here, the input operand is tied to an output which means that is is
; implicitly popped (and then the output is implicitly pushed).
;
; long double fist2(long double x, int *p) {
; long double y;
; asm ("fistl %1" : "=&t"(y) : "0"(x), "m"(*p) : "memory");
; return y;
; }
;
; CHECK: fist2
; CHECK: fldt
; CHECK: fistl (%e
; CHECK-NOT: fstp
; CHECK: ret
define x86_fp80 @fist2(x86_fp80 %x, i32* %p) nounwind ssp {
entry:
%0 = tail call x86_fp80 asm "fistl $2", "=&{st},0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, i32* %p) nounwind
ret x86_fp80 %0
}
; An 'f' constraint is never implicitly popped:
;
; void fucomp1(long double x, long double y) {
; asm volatile ("fucomp %1" : : "t"(x), "f"(y) : "st");
; }
; CHECK: fucomp1
; CHECK: fldt
; CHECK: fldt
; CHECK: fucomp %st
; CHECK: fstp
; CHECK-NOT: fstp
; CHECK: ret
define void @fucomp1(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
entry:
tail call void asm sideeffect "fucomp $1", "{st},f,~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
}
; The 'u' constraint is only popped implicitly when clobbered:
;
; void fucomp2(long double x, long double y) {
; asm volatile ("fucomp %1" : : "t"(x), "u"(y) : "st");
; }
;
; void fucomp3(long double x, long double y) {
; asm volatile ("fucompp %1" : : "t"(x), "u"(y) : "st", "st(1)");
; }
;
; CHECK: fucomp2
; CHECK: fldt
; CHECK: fldt
; CHECK: fucomp %st(1)
; CHECK: fstp
; CHECK-NOT: fstp
; CHECK: ret
;
; CHECK: fucomp3
; CHECK: fldt
; CHECK: fldt
; CHECK: fucompp %st(1)
; CHECK-NOT: fstp
; CHECK: ret
define void @fucomp2(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
entry:
tail call void asm sideeffect "fucomp $1", "{st},{st(1)},~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
}
define void @fucomp3(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
entry:
tail call void asm sideeffect "fucompp $1", "{st},{st(1)},~{st},~{st(1)},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
}
; One input, two outputs, one dead output.
%complex = type { float, float }
; CHECK: sincos1
; CHECK: flds
; CHECK-NOT: fxch
; CHECK: sincos
; CHECK-NOT: fstp
; CHECK: fstp %st(1)
; CHECK-NOT: fstp
; CHECK: ret
define float @sincos1(float %x) nounwind ssp {
entry:
%0 = tail call %complex asm "sincos", "={st},={st(1)},0,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%asmresult = extractvalue %complex %0, 0
ret float %asmresult
}
; Same thing, swapped output operands.
; CHECK: sincos2
; CHECK: flds
; CHECK-NOT: fxch
; CHECK: sincos
; CHECK-NOT: fstp
; CHECK: fstp %st(1)
; CHECK-NOT: fstp
; CHECK: ret
define float @sincos2(float %x) nounwind ssp {
entry:
%0 = tail call %complex asm "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%asmresult = extractvalue %complex %0, 1
ret float %asmresult
}
; Clobber st(0) after it was live-out/dead from the previous asm.
; CHECK: sincos3
; Load x, make a copy for the second asm.
; CHECK: flds
; CHECK: fld %st(0)
; CHECK: sincos
; Discard dead result in st(0), bring x to the top.
; CHECK: fstp %st(0)
; CHECK: fxch
; x is now in st(0) for the second asm
; CHECK: sincos
; Discard both results.
; CHECK: fstp
; CHECK: fstp
; CHECK: ret
define float @sincos3(float %x) nounwind ssp {
entry:
%0 = tail call %complex asm sideeffect "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%1 = tail call %complex asm sideeffect "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%asmresult = extractvalue %complex %0, 0
ret float %asmresult
}