X86: Implement the vectorcall calling convention

This is a Microsoft calling convention that supports both x86 and x86_64
subtargets. It passes vector and floating point arguments in XMM0-XMM5,
and passes them indirectly once they are consumed.

Homogenous vector aggregates of up to four elements can be passed in
sequential vector registers, but this part is not implemented in LLVM
and will be handled in Clang.

On 32-bit x86, it is similar to fastcall in that it uses ecx:edx as
integer register parameters and is callee cleanup. On x86_64, it
delegates to the normal win64 calling convention.

Reviewers: majnemer

Differential Revision: http://reviews.llvm.org/D5943

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220745 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Reid Kleckner 2014-10-28 01:29:26 +00:00
parent dd778c6c9f
commit d5de327da0
9 changed files with 226 additions and 35 deletions

View File

@ -140,7 +140,11 @@ namespace CallingConv {
/// convention differs from the more common \c X86_64_SysV convention
/// in a number of ways, most notably in that XMM registers used to pass
/// arguments are shadowed by GPRs, and vice versa.
X86_64_Win64 = 79
X86_64_Win64 = 79,
/// \brief MSVC calling convention that passes vectors and vector aggregates
/// in SSE registers.
X86_VectorCall = 80
};
} // End CallingConv namespace

View File

@ -580,6 +580,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(x86_stdcallcc);
KEYWORD(x86_fastcallcc);
KEYWORD(x86_thiscallcc);
KEYWORD(x86_vectorcallcc);
KEYWORD(arm_apcscc);
KEYWORD(arm_aapcscc);
KEYWORD(arm_aapcs_vfpcc);

View File

@ -1448,6 +1448,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'x86_stdcallcc'
/// ::= 'x86_fastcallcc'
/// ::= 'x86_thiscallcc'
/// ::= 'x86_vectorcallcc'
/// ::= 'arm_apcscc'
/// ::= 'arm_aapcscc'
/// ::= 'arm_aapcs_vfpcc'
@ -1473,6 +1474,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_x86_stdcallcc: CC = CallingConv::X86_StdCall; break;
case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
case lltok::kw_arm_apcscc: CC = CallingConv::ARM_APCS; break;
case lltok::kw_arm_aapcscc: CC = CallingConv::ARM_AAPCS; break;
case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;

View File

@ -87,7 +87,7 @@ namespace lltok {
kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
kw_intel_ocl_bicc,
kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_vectorcallcc,
kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
kw_msp430_intrcc,
kw_ptx_kernel, kw_ptx_device,

View File

@ -285,6 +285,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;

View File

@ -22,7 +22,7 @@ using namespace llvm;
static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
Mangler::ManglerPrefixTy PrefixTy,
const DataLayout &DL, bool UseAt) {
const DataLayout &DL, char Prefix) {
SmallString<256> TmpData;
StringRef Name = GVName.toStringRef(TmpData);
assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
@ -39,13 +39,8 @@ static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
else if (PrefixTy == Mangler::LinkerPrivate)
OS << DL.getLinkerPrivateGlobalPrefix();
if (UseAt) {
OS << '@';
} else {
char Prefix = DL.getGlobalPrefix();
if (Prefix != '\0')
OS << Prefix;
}
if (Prefix != '\0')
OS << Prefix;
// If this is a simple string that doesn't need escaping, just append it.
OS << Name;
@ -53,7 +48,8 @@ static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
void Mangler::getNameWithPrefix(raw_ostream &OS, const Twine &GVName,
ManglerPrefixTy PrefixTy) const {
return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, false);
char Prefix = DL->getGlobalPrefix();
return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, Prefix);
}
void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
@ -63,11 +59,21 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
return getNameWithPrefix(OS, GVName, PrefixTy);
}
/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
/// a suffix on their name indicating the number of words of arguments they
/// take.
static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
const DataLayout &TD) {
static bool hasByteCountSuffix(CallingConv::ID CC) {
switch (CC) {
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
return true;
default:
return false;
}
}
/// Microsoft fastcall and stdcall functions require a suffix on their name
/// indicating the number of words of arguments they take.
static void addByteCountSuffix(raw_ostream &OS, const Function *F,
const DataLayout &TD) {
// Calculate arguments size total.
unsigned ArgWords = 0;
for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
@ -76,8 +82,9 @@ static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
// 'Dereference' type in case of byval or inalloca parameter attribute.
if (AI->hasByValOrInAllocaAttr())
Ty = cast<PointerType>(Ty)->getElementType();
// Size should be aligned to DWORD boundary
ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
// Size should be aligned to pointer size.
unsigned PtrSize = TD.getPointerSize();
ArgWords += RoundUpToAlignment(TD.getTypeAllocSize(Ty), PtrSize);
}
OS << '@' << ArgWords;
@ -106,34 +113,40 @@ void Mangler::getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV,
}
StringRef Name = GV->getName();
char Prefix = DL->getGlobalPrefix();
bool UseAt = false;
const Function *MSFunc = nullptr;
CallingConv::ID CC;
if (Name[0] != '\1' && DL->hasMicrosoftFastStdCallMangling()) {
if ((MSFunc = dyn_cast<Function>(GV))) {
CC = MSFunc->getCallingConv();
// fastcall functions need to start with @ instead of _.
if (CC == CallingConv::X86_FastCall)
UseAt = true;
}
// Mangle functions with Microsoft calling conventions specially. Only do
// this mangling for x86_64 vectorcall and 32-bit x86.
const Function *MSFunc = dyn_cast<Function>(GV);
if (Name.startswith("\01"))
MSFunc = nullptr; // Don't mangle when \01 is present.
CallingConv::ID CC = MSFunc ? MSFunc->getCallingConv() : CallingConv::C;
if (!DL->hasMicrosoftFastStdCallMangling() &&
CC != CallingConv::X86_VectorCall)
MSFunc = nullptr;
if (MSFunc) {
if (CC == CallingConv::X86_FastCall)
Prefix = '@'; // fastcall functions have an @ prefix instead of _.
else if (CC == CallingConv::X86_VectorCall)
Prefix = '\0'; // vectorcall functions have no prefix.
}
getNameWithPrefixx(OS, Name, PrefixTy, *DL, UseAt);
getNameWithPrefixx(OS, Name, PrefixTy, *DL, Prefix);
if (!MSFunc)
return;
// If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
// add it.
// fastcall and stdcall functions usually need @42 at the end to specify
// the argument info.
// If we are supposed to add a microsoft-style suffix for stdcall, fastcall,
// or vectorcall, add it. These functions have a suffix of @N where N is the
// cumulative byte size of all of the parameters to the function in decimal.
if (CC == CallingConv::X86_VectorCall)
OS << '@'; // vectorcall functions use a double @ suffix.
FunctionType *FT = MSFunc->getFunctionType();
if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
if (hasByteCountSuffix(CC) &&
// "Pure" variadic functions do not receive @0 suffix.
(!FT->isVarArg() || FT->getNumParams() == 0 ||
(FT->getNumParams() == 1 && MSFunc->hasStructRetAttr())))
AddFastCallStdCallSuffix(OS, MSFunc, *DL);
addByteCountSuffix(OS, MSFunc, *DL);
}
void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,

View File

@ -20,6 +20,19 @@
namespace llvm {
inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
// Similar to CCPassIndirect, with the addition of inreg.
LocVT = MVT::i32;
LocInfo = CCValAssign::Indirect;
ArgFlags.setInReg();
return false; // Continue the search, but now for i32.
}
inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
CCState &) {

View File

@ -124,6 +124,24 @@ def RetCC_X86_32_HiPE : CallingConv<[
CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
]>;
// X86-32 HiPE return-value convention.
def RetCC_X86_32_VectorCall : CallingConv<[
// Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
// 256-bit FP vectors
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
// 512-bit FP vectors
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
// Return integers in the standard way.
CCDelegateTo<RetCC_X86Common>
]>;
// X86-64 C return-value convention.
def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
@ -179,6 +197,7 @@ def RetCC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
// Otherwise, use RetCC_X86_32_C.
CCDelegateTo<RetCC_X86_32_C>
@ -330,6 +349,25 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[f80], CCAssignToStack<0, 0>>
]>;
def CC_X86_Win64_VectorCall : CallingConv<[
// The first 6 floating point and vector types of 128 bits or less use
// XMM0-XMM5.
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
// 256-bit vectors use YMM registers.
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
// 512-bit vectors use ZMM registers.
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
// Delegate to fastcall to handle integer types.
CCDelegateTo<CC_X86_Win64_C>
]>;
def CC_X86_64_GHC : CallingConv<[
// Promote i8/i16/i32 arguments to i64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@ -463,6 +501,30 @@ def CC_X86_32_FastCall : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
def CC_X86_32_VectorCall : CallingConv<[
// The first 6 floating point and vector types of 128 bits or less use
// XMM0-XMM5.
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
// 256-bit vectors use YMM registers.
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
// 512-bit vectors use ZMM registers.
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
// Otherwise, pass it indirectly.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCCustom<"CC_X86_32_VectorCallIndirect">>,
// Delegate to fastcall to handle integer types.
CCDelegateTo<CC_X86_32_FastCall>
]>;
def CC_X86_32_ThisCall_Common : CallingConv<[
// The first integer argument is passed in ECX
CCIfType<[i32], CCAssignToReg<[ECX]>>,
@ -576,6 +638,7 @@ def CC_Intel_OCL_BI : CallingConv<[
// This is the root argument convention for the X86-32 backend.
def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
@ -593,6 +656,7 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,

View File

@ -0,0 +1,93 @@
; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; Test integer arguments.
define x86_vectorcallcc i32 @test_int_1() {
ret i32 0
}
; CHECK-LABEL: {{^}}test_int_1@@0:
; CHECK: xorl %eax, %eax
define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
ret i32 %a
}
; X86-LABEL: {{^}}test_int_2@@4:
; X64-LABEL: {{^}}test_int_2@@8:
; CHECK: movl %ecx, %eax
define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
%at = trunc i64 %a to i32
ret i32 %at
}
; X86-LABEL: {{^}}test_int_3@@8:
; X64-LABEL: {{^}}test_int_3@@8:
; CHECK: movl %ecx, %eax
define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
%s = add i32 %a, %b
ret i32 %s
}
; X86-LABEL: {{^}}test_int_4@@8:
; X86: leal (%ecx,%edx), %eax
; X64-LABEL: {{^}}test_int_4@@16:
; X64: leal (%rcx,%rdx), %eax
define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
ret i32 0
}
; CHECK-LABEL: {{^}}test_int_5:
define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
ret double %b
}
; CHECK-LABEL: {{^}}test_fp_1@@16:
; CHECK: movaps %xmm1, %xmm0
define x86_vectorcallcc double @test_fp_2(
double, double, double, double, double, double, double %r) {
ret double %r
}
; CHECK-LABEL: {{^}}test_fp_2@@56:
; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
ret {double, double, double, double}
{ double 0.0, double 0.0, double 0.0, double 0.0 }
}
; CHECK-LABEL: {{^}}test_fp_3@@0:
; CHECK: xorps %xmm0
; CHECK: xorps %xmm1
; CHECK: xorps %xmm2
; CHECK: xorps %xmm3
; FIXME: Returning via x87 isn't compatible, but its hard to structure the
; tablegen any other way.
define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
ret {double, double, double, double, double}
{ double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
}
; CHECK-LABEL: {{^}}test_fp_4@@0:
; CHECK: fldz
; CHECK: xorps %xmm0
; CHECK: xorps %xmm1
; CHECK: xorps %xmm2
; CHECK: xorps %xmm3
define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
ret <16 x i8> %b
}
; CHECK-LABEL: {{^}}test_vec_1@@32:
; CHECK: movaps %xmm1, %xmm0
define x86_vectorcallcc <16 x i8> @test_vec_2(
double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
ret <16 x i8> %r
}
; CHECK-LABEL: {{^}}test_vec_2@@104:
; CHECK: movaps (%{{[re]}}cx), %xmm0