Implement X86 code generation for musttail

Currently, musttail codegen is relying on sibcall optimization, and
reporting a fatal error if fails.  Sibcall optimization fails when stack
arguments need to be modified, which is insufficient for musttail.

The logic for moving arguments in memory safely is already implemented
for GuaranteedTailCallOpt.  This change merely arranges for musttail
calls to use it.

No functional change for GuaranteedTailCallOpt.

Reviewers: espindola

Differential Revision: http://reviews.llvm.org/D3493

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207598 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Reid Kleckner 2014-04-29 23:55:41 +00:00
parent fc288c8339
commit 9902128e2a
4 changed files with 274 additions and 47 deletions

View File

@ -2498,10 +2498,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
unsigned SlotSize, int FPDiff, SDLoc dl) {
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@ -2538,17 +2538,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
if (isTailCall) {
bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
@ -2583,7 +2585,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall) {
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@ -2746,8 +2748,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
// For tail calls lower the arguments to the 'real' stack slot.
if (isTailCall) {
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
@ -2759,39 +2763,40 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isRegLoc())
continue;
assert(VA.isMemLoc());
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isRegLoc())
continue;
assert(VA.isMemLoc());
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy());
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl,
RegInfo->getStackRegister(),
getPointerTy());
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl,
RegInfo->getStackRegister(),
getPointerTy());
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(
DAG.getStore(ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, 0));
}
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(
DAG.getStore(ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, 0));
}
}

View File

@ -0,0 +1,124 @@
; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
; struct A { A(); ~A(); int a; };
;
; struct B {
; virtual int f(int);
; virtual int g(A, int, A);
; virtual void h(A, int, A);
; virtual A i(A, int, A);
; virtual A j(int);
; };
;
; int (B::*mp_f)(int) = &B::f;
; int (B::*mp_g)(A, int, A) = &B::g;
; void (B::*mp_h)(A, int, A) = &B::h;
; A (B::*mp_i)(A, int, A) = &B::i;
; A (B::*mp_j)(int) = &B::j;
; Each member pointer creates a thunk. The ones with inalloca are required to
; tail calls by the ABI, even at O0.
%struct.B = type { i32 (...)** }
%struct.A = type { i32 }
; CHECK-LABEL: f_thunk:
; CHECK: jmpl
; CHECK-NOT: ret
define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
entry:
%1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
%vtable = load i32 (%struct.B*, i32)*** %1
%2 = load i32 (%struct.B*, i32)** %vtable
%3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
ret i32 %3
}
; Inalloca thunks shouldn't require any stores to the stack.
; CHECK-LABEL: g_thunk:
; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
; CHECK: jmpl
; CHECK-NOT: ret
define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
entry:
%1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
%vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
%vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
%2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
%3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
ret i32 %3
}
; CHECK-LABEL: h_thunk:
; CHECK: jmpl
; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
; CHECK-NOT: ret
define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
entry:
%1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
%vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
%vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
%2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
ret void
}
; CHECK-LABEL: i_thunk:
; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
; CHECK: jmpl
; CHECK-NOT: ret
define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
entry:
%1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
%vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
%vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
%2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
%3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
ret %struct.A* %3
}
; CHECK-LABEL: j_thunk:
; CHECK: jmpl
; CHECK-NOT: ret
define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
entry:
%1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
%vtable = load void (%struct.A*, %struct.B*, i32)*** %1
%vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
%2 = load void (%struct.A*, %struct.B*, i32)** %vfn
musttail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
ret void
}
; CHECK-LABEL: _stdcall_thunk@8:
; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
; CHECK: jmpl
; CHECK-NOT: ret
define x86_stdcallcc i32 @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
entry:
%this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
%this = load %struct.B** %this_ptr
%1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
%vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
%vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
%2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
%3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
ret i32 %3
}
; CHECK-LABEL: @fastcall_thunk@8:
; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
; CHECK: jmpl
; CHECK-NOT: ret
define x86_fastcallcc i32 @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
entry:
%1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
%vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
%vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
%2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
%3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
ret i32 %3
}

View File

@ -0,0 +1,31 @@
; RUN: llc -march=x86 < %s | FileCheck %s
; RUN: llc -march=x86 -O0 < %s | FileCheck %s
; CHECK-LABEL: t1:
; CHECK: jmp {{_?}}t1_callee
define x86_thiscallcc void @t1(i8* %this) {
%adj = getelementptr i8* %this, i32 4
musttail call x86_thiscallcc void @t1_callee(i8* %adj)
ret void
}
declare x86_thiscallcc void @t1_callee(i8* %this)
; CHECK-LABEL: t2:
; CHECK: jmp {{_?}}t2_callee
define x86_thiscallcc i32 @t2(i8* %this, i32 %a) {
%adj = getelementptr i8* %this, i32 4
%rv = musttail call x86_thiscallcc i32 @t2_callee(i8* %adj, i32 %a)
ret i32 %rv
}
declare x86_thiscallcc i32 @t2_callee(i8* %this, i32 %a)
; CHECK-LABEL: t3:
; CHECK: jmp {{_?}}t3_callee
define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
%adj = getelementptr i8* %this, i32 4
%a_ptr = getelementptr <{ i8*, i32 }>* %args, i32 0, i32 1
store i32 0, i32* %a_ptr
%rv = musttail call x86_thiscallcc i8* @t3_callee(i8* %adj, <{ i8*, i32 }>* inalloca %args)
ret i8* %rv
}
declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);

View File

@ -1,8 +1,6 @@
; RUN: llc -march=x86 < %s | FileCheck %s
; FIXME: Eliminate this tail call at -O0, since musttail is a correctness
; requirement.
; RUN: not llc -march=x86 -O0 < %s
; RUN: llc -march=x86 -O0 < %s | FileCheck %s
; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
declare void @t1_callee(i8*)
define void @t1(i32* %a) {
@ -21,3 +19,72 @@ define i32* @t2() {
%w = bitcast i8* %v to i32*
ret i32* %w
}
; Complex frame layout: stack realignment with dynamic alloca.
define void @t3(i32 %n) alignstack(32) nounwind {
entry:
; CHECK: t3:
; CHECK: pushl %ebp
; CHECK: pushl %esi
; CHECK: andl $-32, %esp
; CHECK: movl %esp, %esi
; CHECK: popl %esi
; CHECK: popl %ebp
; CHECK-NEXT: jmp {{_?}}t3_callee
%a = alloca i8, i32 %n
call void @capture(i8* %a)
musttail call void @t3_callee(i32 %n) nounwind
ret void
}
declare void @capture(i8*)
declare void @t3_callee(i32)
; Test that we actually copy in and out stack arguments that aren't forwarded
; without modification.
define i32 @t4({}* %fn, i32 %n, i32 %r) {
; CHECK-LABEL: t4:
; CHECK: incl %[[r:.*]]
; CHECK: decl %[[n:.*]]
; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
; CHECK: jmpl *%{{.*}}
entry:
%r1 = add i32 %r, 1
%n1 = sub i32 %n, 1
%fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
%r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
ret i32 %r2
}
; Combine the complex stack frame with the parameter modification.
define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) {
; CHECK-LABEL: t5:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: pushl %esi
; Align the stack.
; CHECK: andl $-32, %esp
; CHECK: movl %esp, %esi
; Modify the args.
; CHECK: incl %[[r:.*]]
; CHECK: decl %[[n:.*]]
; Store them through ebp, since that's the only stable arg pointer.
; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
; Epilogue.
; CHECK: leal {{[-0-9]+}}(%ebp), %esp
; CHECK: popl %esi
; CHECK: popl %ebp
; CHECK: jmpl *%{{.*}}
entry:
%a = alloca i8, i32 %n
call void @capture(i8* %a)
%r1 = add i32 %r, 1
%n1 = sub i32 %n, 1
%fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
%r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
ret i32 %r2
}