[ARM64] Support aggressive fastcc/tailcallopt by breaking the ABI and having the callee pop its argument stack.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208837 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent cea72fe763
commit 66b123f0d8
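For orientation, a minimal hedged sketch (not code from this commit) of the convention being enabled: under fastcc with GuaranteedTailCallOpt the callee pops its own stack arguments, so the caller performs no post-call SP cleanup. The helper names below are hypothetical; they mirror DoesCalleeRestoreStack() introduced later in the patch.

    // Sketch only: when does the callee restore (pop) the argument stack?
    bool calleeRestoresStack(bool IsFastCC, bool TailCallOpt) {
      return IsFastCC && TailCallOpt;
    }
    // Caller-side cleanup after a call: zero when the callee pops.
    unsigned callerCleanupBytes(unsigned ArgBytes, bool CalleePops) {
      return CalleePops ? 0 : ArgBytes;
    }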
@@ -107,32 +107,44 @@ bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void ARM64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
const ARM64InstrInfo *TII =
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
if (!TFI->hasReservedCallFrame(MF)) {
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
MachineInstr *Old = I;
DebugLoc DL = Old->getDebugLoc();
unsigned Amount = Old->getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
unsigned Align = TFI->getStackAlignment();
Amount = (Amount + Align - 1) / Align * Align;
DebugLoc DL = I->getDebugLoc();
int Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old->getOpcode();
if (Opc == ARM64::ADJCALLSTACKDOWN) {
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
} else {
assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();

int64_t Amount = I->getOperand(0).getImm();
Amount = RoundUpToAlignment(Amount, Align);
if (!IsDestroy)
Amount = -Amount;

// N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
// doesn't have to pop anything), then the first operand will be zero too so
// this adjustment is a no-op.
if (CalleePopAmount == 0) {
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
//
// ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
//    LSL #0, and the other uses LSL #12.
//
// Mostly call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII);
}
MBB.erase(I);
}
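A minimal standalone sketch of the adjustment arithmetic used in eliminateCallFramePseudoInstr above. The function name and signature are hypothetical (not part of the patch); it returns the SP delta to apply, or 0 when nothing should be emitted.

    #include <cstdint>

    // Sketch of the SP adjustment selected above (hypothetical helper).
    int64_t callFrameAdjustment(bool HasReservedCallFrame, int64_t Amount,
                                uint64_t Align, bool IsDestroy,
                                uint64_t CalleePopAmount) {
      if (!HasReservedCallFrame) {
        Amount = (Amount + Align - 1) / Align * Align; // RoundUpToAlignment
        if (!IsDestroy)
          Amount = -Amount;              // ADJCALLSTACKDOWN grows the stack
        // If the callee already popped its arguments there is nothing to do.
        return CalleePopAmount == 0 ? Amount : 0;
      }
      // Reserved call frame: only re-reserve whatever the callee popped.
      return CalleePopAmount != 0 ? -(int64_t)CalleePopAmount : 0;
    }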
@@ -420,8 +432,57 @@ void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
const ARM64RegisterInfo *RegInfo =
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
unsigned RetOpcode = MBBI->getOpcode();

int NumBytes = MFI->getStackSize();
const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();

// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = 0;
if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) {
MachineOperand &StackAdjust = MBBI->getOperand(1);

// For a tail-call in a callee-pops-arguments environment, some or all of
// the stack may actually be in use for the call's arguments; this is
// calculated during LowerCall and consumed here...
ArgumentPopSize = StackAdjust.getImm();
} else {
// ... otherwise the amount to pop is *all* of the argument space,
// conveniently stored in the MachineFunctionInfo by
// LowerFormalArguments. This will, of course, be zero for the C calling
// convention.
ArgumentPopSize = AFI->getArgumentStackToRestore();
}

// The stack frame should be like below,
//
// ----------------------                     ---
// |                    |                      |
// | BytesInStackArgArea|              CalleeArgStackSize
// | (NumReusableBytes) |                (of tail call)
// |                    |                     ---
// |                    |                      |
// ---------------------|        ---           |
// |                    |         |            |
// |   CalleeSavedReg   |         |            |
// | (NumRestores * 16) |         |            |
// |                    |         |            |
// ---------------------|         |         NumBytes
// |                    |     StackSize  (StackAdjustUp)
// |   LocalStackSize   |         |            |
// | (covering callee   |         |            |
// |       args)        |         |            |
// |                    |         |            |
// ----------------------        ---          ---
//
// So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
//             = StackSize + ArgumentPopSize
//
// ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of ARM64ISD::TC_RETURN.
NumBytes += ArgumentPopSize;

unsigned NumRestores = 0;
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBBI;
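A hedged sketch of the epilogue bookkeeping above, with hypothetical helper names: the number of argument bytes to pop comes either from the TC_RETURN operand or from the value LowerFormalArguments recorded, and it is simply added to the SP restore.

    // Sketch only: how the epilogue picks the argument-pop size.
    uint64_t argumentPopSize(bool IsTCReturn, int64_t TCReturnStackAdjust,
                             uint64_t ArgumentStackToRestore) {
      return IsTCReturn ? (uint64_t)TCReturnStackAdjust : ArgumentStackToRestore;
    }
    // NumBytes = StackSize + ArgumentPopSize, as the diagram above derives.
    int epilogueRestoreBytes(int StackSize, uint64_t ArgumentPopSize) {
      return StackSize + (int)ArgumentPopSize;
    }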
@@ -39,12 +39,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");

// This option should go away when tail calls fully work.
static cl::opt<bool>
EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
cl::init(true));

enum AlignMode {
StrictAlign,
NoStrictAlign
@@ -1811,6 +1805,27 @@ SDValue ARM64TargetLowering::LowerFormalArguments(
AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}

ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = RoundUpToAlignment(StackArgSize, 16);

// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);

// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);

return Chain;
}
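A small sketch of the 16-byte rounding applied to StackArgSize above. RoundUpToAlignment is the existing LLVM helper; the bit-trick version below is only for illustration.

    // Round up to the next multiple of 16, e.g. 8 -> 16, 16 -> 16, 20 -> 32.
    unsigned roundUpTo16(unsigned StackArgSize) {
      return (StackArgSize + 15u) & ~15u;
    }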
@@ -1942,57 +1957,147 @@ bool ARM64TargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.

// Do not sibcall optimize vararg calls unless the call site is not passing
// any arguments.
if (isVarArg && !Outs.empty())
// For CallingConv::C this function knows whether the ABI needs
// changing. That's not true for other conventions so they will have to opt in
// manually.
if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;

// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
const MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;

// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF->arg_begin(),
e = CallerF->arg_end();
i != e; ++i)
if (i->hasByValAttr())
return false;

if (getTargetMachine().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return false;
}

// Note that currently ARM64 "C" calling convention and "Fast" calling
// convention are compatible. If/when that ever changes, we'll need to
// add checks here to make sure any interactions are OK.
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.

// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");

if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.

// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
if (VA.getLocInfo() == CCValAssign::Indirect)

CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}

// If the calling conventions do not match, then we'd better make sure the
// results are returned in the same way as what the caller expects.
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));

SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));

if (RVLocs1.size() != RVLocs2.size())
return false;
for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
return false;
if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
return false;
if (RVLocs1[i].isRegLoc()) {
if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
return false;
if (VA.needsCustom()) {
// Just don't handle anything that needs custom adjustments for now.
// If need be, we can revisit later, but we shouldn't ever end up
// here.
} else {
if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
return false;
} else if (!VA.isRegLoc()) {
// Likewise, don't try to handle stack based arguments for the
// time being.
return false;
}
}
}
}

return true;
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());

CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

const ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();

// If the stack arguments for this call would fit into our own save area then
// the call can be made tail.
return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
}

SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo *MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;

// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);

// Add a chain value for each stack argument corresponding
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;

if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
}

// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
}

bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
return CallCC == CallingConv::Fast;
}

/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
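A sketch of the byte-range overlap test addTokenForArgument uses above when deciding whether an incoming stack-argument load must be chained before a tail-call store. The free function is hypothetical; it only restates the condition from the loop.

    #include <cstdint>

    // True when [FirstByte, LastByte] and [InFirstByte, InLastByte] overlap.
    bool argRangesOverlap(int64_t FirstByte, int64_t LastByte,
                          int64_t InFirstByte, int64_t InLastByte) {
      return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
             (FirstByte <= InFirstByte && InFirstByte <= LastByte);
    }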
@@ -2012,9 +2117,9 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;

// If tail calls are explicitly disabled, make sure not to use them.
if (!EnableARM64TailCalls)
IsTailCall = false;
ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;

if (IsTailCall) {
// Check if it's really possible to do a tail call.
@@ -2024,9 +2129,12 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt, only automatically
// detected sibcalls.
// FIXME: Re-evaluate. Is this true? Should it be true?

// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;

if (IsTailCall)
++NumTailCalls;
}
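A hedged restatement of the sibcall decision above as a tiny predicate (assumed helper, not in the patch): without GuaranteedTailCallOpt, an eligible tail call is treated as a sibling call that leaves the ABI unchanged.

    // Sketch: sibling call = tail call performed without the callee-pops ABI.
    bool isSibCall(bool IsTailCall, bool GuaranteedTailCallOpt) {
      return IsTailCall && !GuaranteedTailCallOpt;
    }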
@@ -2082,9 +2190,42 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();

if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}

// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;

if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = RoundUpToAlignment(NumBytes, 16);

// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;

// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}

// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsTailCall)
if (!IsSibCall)
Chain =
DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
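A sketch of the FPDiff and callee-pop arithmetic described above, using hypothetical helper functions: outgoing bytes are rounded to 16, FPDiff compares them with the caller's reusable incoming-argument area, and the rounded figure is also what a callee-pops call deallocates (the value later given to CALLSEQ_END).

    // Keep the popped size 16-byte aligned; SP must stay 16-byte aligned.
    unsigned roundedOutgoingBytes(unsigned NumBytes) {
      return (NumBytes + 15u) & ~15u;
    }
    // Negative when the tail call needs more stack than our own argument area.
    int tailCallFPDiff(unsigned NumReusableBytes, unsigned NumBytes) {
      return (int)NumReusableBytes - (int)roundedOutgoingBytes(NumBytes);
    }
    // Bytes the callee pops; zero unless fastcc with GuaranteedTailCallOpt.
    unsigned calleePopBytes(bool CalleeRestoresStack, unsigned NumBytes) {
      return CalleeRestoresStack ? roundedOutgoingBytes(NumBytes) : 0;
    }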
@@ -2134,31 +2275,50 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
// There's no reason we can't support stack args w/ tailcall, but
// we currently don't, so assert if we see one.
assert(!IsTailCall && "stack argument with tail call!?");

// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
SDValue DstAddr;
MachinePointerInfo DstInfo;

// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getLocVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
if (OpSize < 8)
BEAlign = 8 - OpSize;
}

unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset);
PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);

if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);

DstAddr = DAG.getFrameIndex(FI, getPointerTy());
DstInfo = MachinePointerInfo::getFixedStack(FI);

// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset);

DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(LocMemOffset);
}

if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVolatile = */ false,
/*alwaysInline = */ false,
MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
/*alwaysInline = */ false, DstInfo, MachinePointerInfo());

MemOpChains.push_back(Cpy);
} else {
@@ -2171,9 +2331,8 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocVT() == MVT::i16))
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);

SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
MachinePointerInfo::getStack(LocMemOffset),
false, false, 0);
SDValue Store =
DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
MemOpChains.push_back(Store);
}
}
@@ -2221,10 +2380,27 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
}

// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag, DL);
InFlag = Chain.getValue(1);
}

std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);

if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
}

// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2262,8 +2438,13 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);

uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
? RoundUpToAlignment(NumBytes, 16)
: 0;

Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag, DL);
DAG.getIntPtrConstant(CalleePopBytes, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -7441,9 +7622,6 @@ bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// return instructions to help enable tail call optimizations for this
// instruction.
bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
if (!EnableARM64TailCalls)
return false;

if (!CI->isTailCall())
return false;
@@ -353,6 +353,16 @@ private:
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;

/// Finds the incoming stack arguments which overlap the given fixed stack
/// object and incorporates their load into the current chain. This prevents
/// an upcoming store from clobbering the stack argument before it's used.
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
MachineFrameInfo *MFI, int ClobberedFI) const;

bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;

bool IsTailCallConvention(CallingConv::ID CallCC) const;

void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
SDValue &Chain) const;
@@ -88,7 +88,7 @@ def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>;
def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;

def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
@@ -5163,12 +5163,15 @@ def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
}

def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;

include "ARM64InstrAtomics.td"
@@ -25,6 +25,22 @@ namespace llvm {
/// contains private ARM64-specific information for each MachineFunction.
class ARM64FunctionInfo : public MachineFunctionInfo {

/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
/// all usable during a tail call.
///
/// The alternative would forbid tail call optimisation in some cases: if we
/// want to transfer control from a function with 8-bytes of stack-argument
/// space to a function with 16-bytes then misalignment of this value would
/// make a stack adjustment necessary, which could not be undone by the
/// callee.
unsigned BytesInStackArgArea;

/// The number of bytes to restore to deallocate space for incoming
/// arguments. Canonically 0 in the C calling convention, but non-zero when
/// callee is expected to pop the args.
unsigned ArgumentStackToRestore;

/// HasStackFrame - True if this function has a stack frame. Set by
/// processFunctionBeforeCalleeSavedScan().
bool HasStackFrame;
@@ -58,17 +74,25 @@ class ARM64FunctionInfo : public MachineFunctionInfo {

public:
ARM64FunctionInfo()
: HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}

explicit ARM64FunctionInfo(MachineFunction &MF)
: HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
VarArgsFPRIndex(0), VarArgsFPRSize(0) {
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
(void)MF;
}

unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }

unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
void setArgumentStackToRestore(unsigned bytes) {
ArgumentStackToRestore = bytes;
}

bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64

; This test is designed to be run in the situation where the
; call-frame is not reserved (hence disable-fp-elim), but where
@@ -14,20 +15,28 @@ define fastcc void @foo(i32 %in) {
; Normal frame setup stuff:
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp

; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
; CHECK-ARM64: sub sp, sp, #16

  call fastcc void @will_pop([8 x i32] undef, i32 42)
; CHECK: bl will_pop
; CHECK-ARM64: bl will_pop

; Since @will_pop is fastcc with tailcallopt, it will put the stack
; back where it needs to be, we shouldn't duplicate that
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
; CHECK-ARM64-NOT: sub sp, sp, #16
; CHECK-ARM64-NOT: add sp, sp,

; CHECK: ldp x29, x30
; CHECK: add sp, sp,
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64: ldp x29, x30, [sp], #16
  ret void
}
@@ -40,19 +49,26 @@ define void @foo1(i32 %in) {
; Normal frame setup again
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp

; Reserve space for call-frame
; CHECK: sub sp, sp, #16
; CHECK-ARM64: sub sp, sp, #16

  call void @wont_pop([8 x i32] undef, i32 42)
; CHECK: bl wont_pop
; CHECK-ARM64: bl wont_pop

; This time we *do* need to unreserve the call-frame
; CHECK: add sp, sp, #16
; CHECK-ARM64: add sp, sp, #16

; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
; CHECK-ARM64: mov sp, x29
; CHECK-ARM64: ldp x29, x30, [sp], #16
  ret void
}
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s

; Without tailcallopt fastcc still means the caller cleans up the
; stack, so try to make sure this is respected.
@@ -8,116 +10,219 @@ define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
; CHECK: sub sp, sp, #48

; CHECK-ARM64-LABEL: func_stack0:
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-NEXT: mov x29, sp
; CHECK-ARM64-NEXT: sub sp, sp, #32

; CHECK-TAIL-LABEL: func_stack0:
; CHECK-TAIL: sub sp, sp, #48

; CHECK-ARM64-TAIL-LABEL: func_stack0:
; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-TAIL-NEXT: mov x29, sp
; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32

  call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16

; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16

  call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32

; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32

  call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp

; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp

; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp

; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp

  ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret

; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret

; CHECK-TAIL: add sp, sp, #48
; CHECK-TAIL-NEXT: ret

; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}

define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
; CHECK: sub sp, sp, #48

; CHECK-ARM64-LABEL: func_stack8:
; CHECK-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-ARM64: mov x29, sp
; CHECK-ARM64: sub sp, sp, #32

; CHECK-TAIL-LABEL: func_stack8:
; CHECK-TAIL: sub sp, sp, #48

; CHECK-ARM64-TAIL-LABEL: func_stack8:
; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
; CHECK-ARM64-TAIL: mov x29, sp
; CHECK-ARM64-TAIL: sub sp, sp, #32

  call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16

; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16

  call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32

; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32

  call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp

; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp

; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp

; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp

  ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret

; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret

; CHECK-TAIL: add sp, sp, #64
; CHECK-TAIL-NEXT: ret

; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}

define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
; CHECK: sub sp, sp, #48

; CHECK-ARM64-LABEL: func_stack32:
; CHECK-ARM64: mov x29, sp

; CHECK-TAIL-LABEL: func_stack32:
; CHECK-TAIL: sub sp, sp, #48

; CHECK-ARM64-TAIL-LABEL: func_stack32:
; CHECK-ARM64-TAIL: mov x29, sp

  call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack8
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16

; CHECK-ARM64-TAIL: bl func_stack8
; CHECK-ARM64-TAIL: sub sp, sp, #16

  call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,

; CHECK-ARM64: bl func_stack32
; CHECK-ARM64-NOT: sub sp, sp,

; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32

; CHECK-ARM64-TAIL: bl func_stack32
; CHECK-ARM64-TAIL: sub sp, sp, #32

  call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp

; CHECK-ARM64: bl func_stack0
; CHECK-ARM64-NOT: sub sp, sp

; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp

; CHECK-ARM64-TAIL: bl func_stack0
; CHECK-ARM64-TAIL-NOT: sub sp, sp

  ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret

; CHECK-ARM64: mov sp, x29
; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-NEXT: ret

; CHECK-TAIL: add sp, sp, #80
; CHECK-TAIL-NEXT: ret

; CHECK-ARM64-TAIL: mov sp, x29
; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-ARM64-TAIL-NEXT: ret
}
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s

declare fastcc void @callee_stack0()
declare fastcc void @callee_stack8([8 x i32], i64)
@@ -7,57 +8,92 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB

; CHECK-ARM64-LABEL: caller_to0_from0:
; CHECK-ARM64-NEXT: // BB

  tail call fastcc void @callee_stack0()
  ret void

; CHECK-NEXT: b callee_stack0

; CHECK-ARM64-NEXT: b callee_stack0
}

define fastcc void @caller_to0_from8([8 x i32], i64) {
; CHECK-LABEL: caller_to0_from8:

; CHECK-ARM64-LABEL: caller_to0_from8:

  tail call fastcc void @callee_stack0()
  ret void

; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0

; CHECK-ARM64: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack0
}

define fastcc void @caller_to8_from0() {
; CHECK-LABEL: caller_to8_from0:
; CHECK: sub sp, sp, #32

; CHECK-ARM64-LABEL: caller_to8_from0:
; CHECK-ARM64: sub sp, sp, #32

; Key point is that the "42" should go #16 below incoming stack
; pointer (we didn't have arg space to reuse).
  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
  ret void

; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8

; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
; CHECK-ARM64-NEXT: b callee_stack8
}

define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to8_from8:
; CHECK: sub sp, sp, #16

; CHECK-ARM64-LABEL: caller_to8_from8:
; CHECK-ARM64: sub sp, sp, #16

; Key point is that the "%a" slot, at SP on entry, should be reused.
  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
  ret void

; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8

; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
; CHECK-ARM64-NEXT: b callee_stack8
}

define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to16_from8:
; CHECK: sub sp, sp, #16

; CHECK-ARM64-LABEL: caller_to16_from8:
; CHECK-ARM64: sub sp, sp, #16

; Important point is that the call reuses the "dead" argument space
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
  tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)

; CHECK: str {{x[0-9]+}}, [sp, #24]
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK: add sp, sp, #16
; CHECK: b callee_stack16
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack16

; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64-NEXT: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack16
  ret void
}
@@ -66,12 +102,19 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: caller_to8_from24:
; CHECK: sub sp, sp, #16

; CHECK-ARM64-LABEL: caller_to8_from24:
; CHECK-ARM64: sub sp, sp, #16

; Key point is that the "%a" slot, at #16 above SP on entry, should be reused.
  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
  ret void

; CHECK: str {{x[0-9]+}}, [sp, #32]
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: b callee_stack8

; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
; CHECK-ARM64-NEXT: b callee_stack8
}
@@ -79,6 +122,9 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK-LABEL: caller_to16_from16:
; CHECK: sub sp, sp, #16

; CHECK-ARM64-LABEL: caller_to16_from16:
; CHECK-ARM64: sub sp, sp, #16

; Here we want to make sure that both loads happen before the stores:
; otherwise either %a or %b will be wrongly clobbered.
  tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
@@ -89,6 +135,11 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK: str x1,
; CHECK: str x0,

; CHECK: add sp, sp, #16
; CHECK: b callee_stack16
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack16

; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
; CHECK-ARM64-NEXT: add sp, sp, #16
; CHECK-ARM64-NEXT: b callee_stack16
}