[ARM64] Support the aggressive fastcc/tailcallopt convention, which breaks the ABI by having the callee pop its stack arguments.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208837 91177308-0d34-0410-b5e6-96231b3b80d8
Jiangning Liu 2014-05-15 01:33:17 +00:00
parent cea72fe763
commit 66b123f0d8
8 changed files with 553 additions and 105 deletions


@ -107,32 +107,44 @@ bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void ARM64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
const ARM64InstrInfo *TII =
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
if (!TFI->hasReservedCallFrame(MF)) {
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
MachineInstr *Old = I;
DebugLoc DL = Old->getDebugLoc();
unsigned Amount = Old->getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
unsigned Align = TFI->getStackAlignment();
Amount = (Amount + Align - 1) / Align * Align;
DebugLoc DL = I->getDebugLoc();
int Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old->getOpcode();
if (Opc == ARM64::ADJCALLSTACKDOWN) {
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
} else {
assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
int64_t Amount = I->getOperand(0).getImm();
Amount = RoundUpToAlignment(Amount, Align);
if (!IsDestroy)
Amount = -Amount;
// N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
// doesn't have to pop anything), then the first operand will be zero too so
// this adjustment is a no-op.
if (CalleePopAmount == 0) {
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
//
// ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
//
// Mostly call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII);
}
MBB.erase(I);
}
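The FIXME above notes that in-function SP adjustments are limited to 24 bits because ADD/SUB (immediate) only offers LSL #0 and LSL #12. A minimal standalone sketch of that two-instruction split, with hypothetical helper names rather than the emitFrameOffset implementation:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Split an SP adjustment into the two AArch64 ADD/SUB immediate forms:
// Lo12 is encoded with LSL #0, Hi12 with LSL #12.
static void splitSPAdjustment(int64_t Amount, uint32_t &Lo12, uint32_t &Hi12) {
  uint64_t Abs = static_cast<uint64_t>(Amount < 0 ? -Amount : Amount);
  assert(Abs < (1ULL << 24) && "call frame too large for ADD/SUB immediates");
  Lo12 = Abs & 0xfff;
  Hi12 = (Abs >> 12) & 0xfff;
}

int main() {
  uint32_t Lo, Hi;
  splitSPAdjustment(0x1234, Lo, Hi); // 4660 bytes of outgoing arguments
  // Expect: sub sp, sp, #1, lsl #12   followed by   sub sp, sp, #0x234
  std::printf("hi12=%#x lo12=%#x\n", Hi, Lo);
  return 0;
}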
@ -420,8 +432,57 @@ void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
const ARM64RegisterInfo *RegInfo =
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
unsigned RetOpcode = MBBI->getOpcode();
int NumBytes = MFI->getStackSize();
const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = 0;
if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) {
MachineOperand &StackAdjust = MBBI->getOperand(1);
// For a tail-call in a callee-pops-arguments environment, some or all of
// the stack may actually be in use for the call's arguments; this is
// calculated during LowerCall and consumed here...
ArgumentPopSize = StackAdjust.getImm();
} else {
// ... otherwise the amount to pop is *all* of the argument space,
// conveniently stored in the MachineFunctionInfo by
// LowerFormalArguments. This will, of course, be zero for the C calling
// convention.
ArgumentPopSize = AFI->getArgumentStackToRestore();
}
// The stack frame should look like this:
//
//      ----------------------                     ---
//      |                    |                      |
//      | BytesInStackArgArea|              CalleeArgStackSize
//      | (NumReusableBytes) |                (of tail call)
//      |                    |                     ---
//      |                    |                      |
//      ---------------------|        ---           |
//      |                    |         |            |
//      |   CalleeSavedReg   |         |            |
//      | (NumRestores * 16) |         |            |
//      |                    |         |            |
//      ---------------------|         |         NumBytes
//      |                    |     StackSize  (StackAdjustUp)
//      |   LocalStackSize   |         |            |
//      | (covering callee   |         |            |
//      |       args)        |         |            |
//      |                    |         |            |
//      ----------------------        ---          ---
//
// So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
//             = StackSize + ArgumentPopSize
//
// ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of ARM64ISD::TC_RETURN.
NumBytes += ArgumentPopSize;
unsigned NumRestores = 0;
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBBI;
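A worked instance of the identity in the diagram above, using made-up sizes; this is arithmetic only, not the emitEpilogue code:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t StackSize = 48;           // MFI->getStackSize(): CSRs + locals
  uint64_t BytesInStackArgArea = 16; // incoming stack-argument area
  uint64_t CalleeArgStackSize = 0;   // plain return, no tail-call arguments
  // For a plain (non-tail-call) return under -tailcallopt, ArgumentPopSize is
  // the whole incoming argument area (ArgumentStackToRestore).
  uint64_t ArgumentPopSize = BytesInStackArgArea - CalleeArgStackSize;
  uint64_t NumBytes = StackSize + ArgumentPopSize;
  std::printf("epilogue adjusts sp by %llu bytes\n",
              (unsigned long long)NumBytes); // 48 + 16 = 64
  return 0;
}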


@ -39,12 +39,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
// This option should go away when tail calls fully work.
static cl::opt<bool>
EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
cl::init(true));
enum AlignMode {
StrictAlign,
NoStrictAlign
@ -1811,6 +1805,27 @@ SDValue ARM64TargetLowering::LowerFormalArguments(
AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = RoundUpToAlignment(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);
return Chain;
}
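A rough model of what the block above records, hard-coding the two ARM64 rules it relies on (fastcc under -tailcallopt is the only callee-pops convention here, and the popped area is rounded up to 16 bytes); the helper name and struct are hypothetical, not LLVM API:

#include <cstdio>

struct ArgAreaInfo {
  unsigned BytesInStackArgArea;
  unsigned ArgumentStackToRestore;
};

static ArgAreaInfo modelFormalArgs(unsigned StackArgSize, bool IsFastCC,
                                   bool GuaranteedTailCallOpt) {
  ArgAreaInfo Info = {StackArgSize, 0};
  if (IsFastCC && GuaranteedTailCallOpt) {
    // Callee will pop its arguments, so the area must be a multiple of 16.
    Info.BytesInStackArgArea = (StackArgSize + 15) & ~15u;
    Info.ArgumentStackToRestore = Info.BytesInStackArgArea;
  }
  return Info;
}

int main() {
  // One i64 stack argument: 8 bytes used, but 16 reserved and restored.
  ArgAreaInfo I = modelFormalArgs(8, /*IsFastCC=*/true, /*TailCallOpt=*/true);
  std::printf("reusable=%u restore=%u\n", I.BytesInStackArgArea,
              I.ArgumentStackToRestore); // reusable=16 restore=16
  return 0;
}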
@ -1942,57 +1957,147 @@ bool ARM64TargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Do not sibcall optimize vararg calls unless the call site is not passing
// any arguments.
if (isVarArg && !Outs.empty())
// For CallingConv::C this function knows whether the ABI needs
// changing. That's not true for other conventions so they will have to opt in
// manually.
if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
const MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF->arg_begin(),
e = CallerF->arg_end();
i != e; ++i)
if (i->hasByValAttr())
return false;
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return false;
}
// Note that currently ARM64 "C" calling convention and "Fast" calling
// convention are compatible. If/when that ever changes, we'll need to
// add checks here to make sure any interactions are OK.
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
if (VA.getLocInfo() == CCValAssign::Indirect)
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the calling conventions do not match, then we'd better make sure the
// results are returned in the same way as what the caller expects.
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
if (RVLocs1.size() != RVLocs2.size())
return false;
for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
return false;
if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
return false;
if (RVLocs1[i].isRegLoc()) {
if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
return false;
if (VA.needsCustom()) {
// Just don't handle anything that needs custom adjustments for now.
// If need be, we can revisit later, but we shouldn't ever end up
// here.
} else {
if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
return false;
} else if (!VA.isRegLoc()) {
// Likewise, don't try to handle stack based arguments for the
// time being.
return false;
}
}
}
}
return true;
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
// If the stack arguments for this call would fit into our own save area then
// the call can be made tail.
return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
}
SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo *MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);
// Add a chain value for each stack argument corresponding
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
}
// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
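The interval test inside addTokenForArgument can be stated on its own: an incoming stack-argument load must be chained in when its byte range intersects the fixed-stack slot about to be stored. A standalone restatement (not the LLVM code itself):

#include <cstdint>
#include <cstdio>

static bool bytesOverlap(int64_t AFirst, int64_t ALast, int64_t BFirst,
                         int64_t BLast) {
  // Two inclusive byte ranges [AFirst, ALast] and [BFirst, BLast] overlap
  // exactly when either range starts inside the other.
  return (BFirst <= AFirst && AFirst <= BLast) ||
         (AFirst <= BFirst && BFirst <= ALast);
}

int main() {
  // Clobbered slot covers bytes [16, 23]; an incoming argument load covers
  // [20, 27] -> overlap, so its chain must precede the store.
  std::printf("%d\n", bytesOverlap(16, 23, 20, 27)); // 1
  // A load of [0, 7] does not overlap [16, 23].
  std::printf("%d\n", bytesOverlap(0, 7, 16, 23));   // 0
  return 0;
}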
bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
}
bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
return CallCC == CallingConv::Fast;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
@ -2012,9 +2117,9 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
// If tail calls are explicitly disabled, make sure not to use them.
if (!EnableARM64TailCalls)
IsTailCall = false;
ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
@ -2024,9 +2129,12 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt, only automatically
// detected sibcalls.
// FIXME: Re-evaluate. Is this true? Should it be true?
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
@ -2082,9 +2190,42 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since the callee pops the argument stack as part of the tail call, we
// must keep the popped size 16-byte aligned.
NumBytes = RoundUpToAlignment(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsTailCall)
if (!IsSibCall)
Chain =
DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
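A worked FPDiff computation for the tail-call block above, with hypothetical byte counts (arithmetic only, not the LowerCall code):

#include <cstdint>
#include <cstdio>

static int64_t roundUpTo16(int64_t Bytes) { return (Bytes + 15) & ~int64_t(15); }

int main() {
  int64_t NumReusableBytes = 16;      // caller's own incoming argument area
  int64_t NumBytes = roundUpTo16(24); // callee wants 24 bytes -> 32, 16-aligned
  // Negative: the tail call needs 16 bytes more than the caller received, so
  // the argument area has to grow; a positive value would mean it can shrink.
  int64_t FPDiff = NumReusableBytes - NumBytes;
  std::printf("FPDiff = %lld\n", (long long)FPDiff); // 16 - 32 = -16
  return 0;
}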
@ -2134,31 +2275,50 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
// There's no reason we can't support stack args w/ tailcall, but
// we currently don't, so assert if we see one.
assert(!IsTailCall && "stack argument with tail call!?");
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getLocVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset);
PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, getPointerTy());
DstInfo = MachinePointerInfo::getFixedStack(FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset);
DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVolatile = */ false,
/*alwaysInline = */ false,
MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
/*alwaysInline = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@ -2171,9 +2331,8 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocVT() == MVT::i16))
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
MachinePointerInfo::getStack(LocMemOffset),
false, false, 0);
SDValue Store =
DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
MemOpChains.push_back(Store);
}
}
@ -2221,10 +2380,27 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call. However, in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@ -2262,8 +2438,13 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
? RoundUpToAlignment(NumBytes, 16)
: 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag, DL);
DAG.getIntPtrConstant(CalleePopBytes, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@ -7441,9 +7622,6 @@ bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// return instructions to help enable tail call optimizations for this
// instruction.
bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
if (!EnableARM64TailCalls)
return false;
if (!CI->isTailCall())
return false;


@ -353,6 +353,16 @@ private:
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
/// Finds the incoming stack arguments which overlap the given fixed stack
/// object and incorporates their load into the current chain. This prevents
/// an upcoming store from clobbering the stack argument before it's used.
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
MachineFrameInfo *MFI, int ClobberedFI) const;
bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
bool IsTailCallConvention(CallingConv::ID CallCC) const;
void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
SDValue &Chain) const;


@ -88,7 +88,7 @@ def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>;
def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
@ -5163,12 +5163,15 @@ def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
}
def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
(TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
include "ARM64InstrAtomics.td"


@ -25,6 +25,22 @@ namespace llvm {
/// contains private ARM64-specific information for each MachineFunction.
class ARM64FunctionInfo : public MachineFunctionInfo {
/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
/// all usable during a tail call.
///
/// The alternative would forbid tail call optimisation in some cases: if we
want to transfer control from a function with 8 bytes of stack-argument
space to a function with 16 bytes, then misalignment of this value would
/// make a stack adjustment necessary, which could not be undone by the
/// callee.
unsigned BytesInStackArgArea;
/// The number of bytes to restore to deallocate space for incoming
/// arguments. Canonically 0 in the C calling convention, but non-zero when
/// callee is expected to pop the args.
unsigned ArgumentStackToRestore;
/// HasStackFrame - True if this function has a stack frame. Set by
/// processFunctionBeforeCalleeSavedScan().
bool HasStackFrame;
@ -58,17 +74,25 @@ class ARM64FunctionInfo : public MachineFunctionInfo {
public:
ARM64FunctionInfo()
: HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
explicit ARM64FunctionInfo(MachineFunction &MF)
: HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
VarArgsFPRIndex(0), VarArgsFPRSize(0) {
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
(void)MF;
}
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
void setArgumentStackToRestore(unsigned bytes) {
ArgumentStackToRestore = bytes;
}
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }


@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64
; This test is designed to be run in the situation where the
; call-frame is not reserved (hence disable-fp-elim), but where
@ -14,20 +15,28 @@ define fastcc void @foo(i32 %in) {
; Normal frame setup stuff:
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
; CHECK-ARM64: sub sp, sp, #16
call fastcc void @will_pop([8 x i32] undef, i32 42)
; CHECK: bl will_pop
; CHECK-ARM64: bl will_pop
; Since @will_pop is fastcc with tailcallopt, it will put the stack
; back where it needs to be, so we shouldn't duplicate that
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
; CHECK-ARM64-NOT: sub sp, sp, #16
; CHECK-ARM64-NOT: add sp, sp,
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}
@ -40,19 +49,26 @@ define void @foo1(i32 %in) {
; Normal frame setup again
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame
; CHECK: sub sp, sp, #16
; CHECK-ARM64: sub sp, sp, #16
call void @wont_pop([8 x i32] undef, i32 42)
; CHECK: bl wont_pop
; CHECK-ARM64: bl wont_pop
; This time we *do* need to unreserve the call-frame
; CHECK: add sp, sp, #16
; CHECK-ARM64: add sp, sp, #16
; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}


@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s
; Without tailcallopt fastcc still means the caller cleans up the
; stack, so try to make sure this is respected.
@ -8,116 +10,219 @@ define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
; CHECK: sub sp, sp, #48
; CHECK-ARM64-LABEL: func_stack0:
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-NEXT: mov x29, sp
; CHECK-ARM64-NEXT: sub sp, sp, #32
; CHECK-TAIL-LABEL: func_stack0:
; CHECK-TAIL: sub sp, sp, #48
; CHECK-ARM64-TAIL-LABEL: func_stack0:
; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-TAIL-NEXT: mov x29, sp
; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret
; CHECK-TAIL: add sp, sp, #48
; CHECK-TAIL-NEXT: ret
; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
; CHECK: sub sp, sp, #48
; CHECK-ARM64-LABEL: func_stack8:
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp
; CHECK-ARM64: sub sp, sp, #32
; CHECK-TAIL-LABEL: func_stack8:
; CHECK-TAIL: sub sp, sp, #48
; CHECK-ARM64-TAIL-LABEL: func_stack8:
; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-TAIL: mov x29, sp
; CHECK-ARM64-TAIL: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret
; CHECK-TAIL: add sp, sp, #64
; CHECK-TAIL-NEXT: ret
; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
; CHECK: sub sp, sp, #48
; CHECK-ARM64-LABEL: func_stack32:
; CHECK-ARM64: mov x29, sp
; CHECK-TAIL-LABEL: func_stack32:
; CHECK-TAIL: sub sp, sp, #48
; CHECK-ARM64-TAIL-LABEL: func_stack32:
; CHECK-ARM64-TAIL: mov x29, sp
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret
; CHECK-TAIL: add sp, sp, #80
; CHECK-TAIL-NEXT: ret
; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}


@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s
declare fastcc void @callee_stack0()
declare fastcc void @callee_stack8([8 x i32], i64)
@ -7,57 +8,92 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB
; CHECK-ARM64-LABEL: caller_to0_from0:
; CHECK-ARM64-NEXT: // BB
tail call fastcc void @callee_stack0()
ret void
; CHECK-NEXT: b callee_stack0
; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to0_from8([8 x i32], i64) {
; CHECK-LABEL: caller_to0_from8:
; CHECK-ARM64-LABEL: caller_to0_from8:
tail call fastcc void @callee_stack0()
ret void
; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0
; CHECK-ARM64: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to8_from0() {
; CHECK-LABEL: caller_to8_from0:
; CHECK: sub sp, sp, #32
; CHECK-ARM64-LABEL: caller_to8_from0:
; CHECK-ARM64: sub sp, sp, #32
; Key point is that the "42" should go #16 below incoming stack
; pointer (we didn't have arg space to reuse).
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to8_from8:
; CHECK: sub sp, sp, #16
; CHECK-ARM64-LABEL: caller_to8_from8:
; CHECK-ARM64: sub sp, sp, #16
; Key point is that the "%a" should go where at SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to16_from8:
; CHECK: sub sp, sp, #16
; CHECK-ARM64-LABEL: caller_to16_from8:
; CHECK-ARM64: sub sp, sp, #16
; Important point is that the call reuses the "dead" argument space
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
; CHECK: str {{x[0-9]+}}, [sp, #24]
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK: add sp, sp, #16
; CHECK: b callee_stack16
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack16
; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64-NEXT: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack16
ret void
}
@ -66,12 +102,19 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: caller_to8_from24:
; CHECK: sub sp, sp, #16
; CHECK-ARM64-LABEL: caller_to8_from24:
; CHECK-ARM64: sub sp, sp, #16
; Key point is that the "%a" should go where at #16 above SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
; CHECK: str {{x[0-9]+}}, [sp, #32]
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: b callee_stack8
; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
; CHECK-ARM64-NEXT: b callee_stack8
}
@ -79,6 +122,9 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK-LABEL: caller_to16_from16:
; CHECK: sub sp, sp, #16
; CHECK-ARM64-LABEL: caller_to16_from16:
; CHECK-ARM64: sub sp, sp, #16
; Here we want to make sure that both loads happen before the stores:
; otherwise either %a or %b will be wrongly clobbered.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
@ -89,6 +135,11 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK: str x1,
; CHECK: str x0,
; CHECK: add sp, sp, #16
; CHECK: b callee_stack16
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack16
; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64-NEXT: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack16
}