Special calling conventions for Intel OpenCL built-in library.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2012-10-24 14:46:16 +00:00
parent 0ed9b8e0bf
commit 3575222175
10 changed files with 289 additions and 1 deletions

View File

@ -112,7 +112,11 @@ namespace CallingConv {
/// Cannot have variable arguments.
/// Can also be called by the host.
/// Is externally visible.
SPIR_KERNEL = 76
SPIR_KERNEL = 76,
/// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins
Intel_OCL_BI = 77
};
} // End CallingConv namespace

View File

@ -527,6 +527,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(ptx_device);
KEYWORD(spir_kernel);
KEYWORD(spir_func);
KEYWORD(intel_ocl_bicc);
KEYWORD(cc);
KEYWORD(c);

View File

@ -1094,6 +1094,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= /*empty*/
/// ::= 'ccc'
/// ::= 'fastcc'
/// ::= 'kw_intel_ocl_bicc'
/// ::= 'coldcc'
/// ::= 'x86_stdcallcc'
/// ::= 'x86_fastcallcc'
@ -1125,6 +1126,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break;
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
case lltok::kw_cc: {
unsigned ArbitraryCC;
Lex.Lex();

View File

@ -77,6 +77,7 @@ namespace lltok {
kw_c,
kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
kw_intel_ocl_bicc,
kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
kw_msp430_intrcc,

View File

@ -88,6 +88,21 @@ def RetCC_X86_32_Fast : CallingConv<[
CCDelegateTo<RetCC_X86Common>
]>;
// Intel_OCL_BI return-value convention.
def RetCC_Intel_OCL_BI : CallingConv<[
// Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
// 256-bit FP vectors
// No more than 4 registers
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
// i32, i64 in the standard way
CCDelegateTo<RetCC_X86Common>
]>;
// X86-64 C return-value convention.
def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
@ -128,6 +143,10 @@ def RetCC_X86_64 : CallingConv<[
// This is the return-value convention used for the entire X86 backend.
def RetCC_X86 : CallingConv<[
// Check if this is the Intel OpenCL built-ins calling convention
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
CCDelegateTo<RetCC_X86_32>
]>;
@ -235,6 +254,29 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[f80], CCAssignToStack<0, 0>>
]>;
// X86-64 Intel OpenCL built-ins calling convention.
def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>,
CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>,
// The SSE vector arguments are passed in XMM registers.
CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
// The 256-bit vector arguments are passed in YMM registers.
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCDelegateTo<CC_X86_64_C>
]>;
def CC_X86_64_GHC : CallingConv<[
// Promote i8/i16/i32 arguments to i64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@ -408,6 +450,7 @@ def CC_X86_64 : CallingConv<[
// This is the argument convention used for the entire X86 backend.
def CC_X86 : CallingConv<[
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
@ -426,3 +469,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
// Standard C + YMM6-15
def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
R13, R14, R15,
(sequence "YMM%u", 6, 15))>;
//Standard C + XMM 8-15
def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
(sequence "XMM%u", 8, 15))>;
//Standard C + YMM 8-15
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;

View File

@ -229,15 +229,26 @@ const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool callsEHReturn = false;
bool ghcCall = false;
bool oclBiCall = false;
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
if (MF) {
callsEHReturn = MF->getMMI().callsEHReturn();
const Function *F = MF->getFunction();
ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
}
if (ghcCall)
return CSR_NoRegs_SaveList;
if (oclBiCall) {
if (HasAVX && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
if (HasAVX && Is64Bit)
return CSR_64_Intel_OCL_BI_AVX_SaveList;
if (!HasAVX && !IsWin64 && Is64Bit)
return CSR_64_Intel_OCL_BI_SaveList;
}
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
@ -252,6 +263,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
if (CC == CallingConv::Intel_OCL_BI) {
if (IsWin64 && HasAVX)
return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
if (Is64Bit && HasAVX)
return CSR_64_Intel_OCL_BI_AVX_RegMask;
if (!HasAVX && !IsWin64 && Is64Bit)
return CSR_64_Intel_OCL_BI_RegMask;
}
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
if (!Is64Bit)

View File

@ -74,6 +74,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out)
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break;

View File

@ -705,6 +705,7 @@ void Verifier::visitFunction(Function &F) {
case CallingConv::Cold:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Intel_OCL_BI:
case CallingConv::PTX_Kernel:
case CallingConv::PTX_Device:
Assert1(!F.isVarArg(),

View File

@ -0,0 +1,107 @@
; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=WIN32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
; WIN64: testf16_inp
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret
; WIN32: testf16_inp
; WIN32: movl %eax, (%esp)
; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN32: call
; WIN32: ret
; NOT_WIN: testf16_inp
; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
; NOT_WIN: leaq {{.*}}(%rsp), %rdi
; NOT_WIN: call
; NOT_WIN: ret
;test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %2, %1
ret <16 x float> %3
}
;test calling conventions - preserved registers
; preserved ymm6-ymm15
; WIN64: testf16_regs
; WIN64: call
; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0
; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1
; WIN64: ret
; preserved ymm8-ymm15
; NOT_WIN: testf16_regs
; NOT_WIN: call
; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
; NOT_WIN: ret
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %1, %b
%4 = fadd <16 x float> %2, %3
ret <16 x float> %4
}
; test calling conventions - prolog and epilog
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
; WIN64: call
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
; NOT_WIN: call
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
ret <16 x float> %c
}

View File

@ -0,0 +1,93 @@
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
; WIN64: testf16_inp
; WIN64: addps {{.*}}, {{%xmm[0-3]}}
; WIN64: addps {{.*}}, {{%xmm[0-3]}}
; WIN64: addps {{.*}}, {{%xmm[0-3]}}
; WIN64: addps {{.*}}, {{%xmm[0-3]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret
; WIN32: testf16_inp
; WIN32: movl %eax, (%esp)
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: call
; WIN32: ret
; NOT_WIN: testf16_inp
; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
; NOT_WIN: leaq {{.*}}(%rsp), %rdi
; NOT_WIN: call
; NOT_WIN: ret
;test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %2, %1
ret <16 x float> %3
}
;test calling conventions - preserved registers
; preserved xmm6-xmm15
; WIN64: testf16_regs
; WIN64: call
; WIN64: addps {{%xmm[6-9]}}, {{.*}}
; WIN64: addps {{%xmm[6-9]}}, {{.*}}
; WIN64: ret
; preserved xmm8-xmm15
; NOT_WIN: testf16_regs
; NOT_WIN: call
; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
; NOT_WIN: ret
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %1, %b
%4 = fadd <16 x float> %2, %3
ret <16 x float> %4
}
; test calling conventions - prolog and epilog
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
; NOT_WIN: call
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
ret <16 x float> %c
}