From 3575222175b4982f380ff291bb17be67aadc0966 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 24 Oct 2012 14:46:16 +0000 Subject: [PATCH] Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CallingConv.h | 6 +- lib/AsmParser/LLLexer.cpp | 1 + lib/AsmParser/LLParser.cpp | 2 + lib/AsmParser/LLToken.h | 1 + lib/Target/X86/X86CallingConv.td | 57 +++++++++++++++ lib/Target/X86/X86RegisterInfo.cpp | 21 ++++++ lib/VMCore/AsmWriter.cpp | 1 + lib/VMCore/Verifier.cpp | 1 + test/CodeGen/X86/avx-intel-ocl.ll | 107 +++++++++++++++++++++++++++++ test/CodeGen/X86/sse-intel-ocl.ll | 93 +++++++++++++++++++++++++ 10 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/X86/avx-intel-ocl.ll create mode 100644 test/CodeGen/X86/sse-intel-ocl.ll diff --git a/include/llvm/CallingConv.h b/include/llvm/CallingConv.h index 86e4eebb827..053f4eb326f 100644 --- a/include/llvm/CallingConv.h +++ b/include/llvm/CallingConv.h @@ -112,7 +112,11 @@ namespace CallingConv { /// Cannot have variable arguments. /// Can also be called by the host. /// Is externally visible. 
- SPIR_KERNEL = 76 + SPIR_KERNEL = 76, + + /// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins + Intel_OCL_BI = 77 + }; } // End CallingConv namespace diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 464dfd51d67..91f973d8d39 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -527,6 +527,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(ptx_device); KEYWORD(spir_kernel); KEYWORD(spir_func); + KEYWORD(intel_ocl_bicc); KEYWORD(cc); KEYWORD(c); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index cc7cc312460..75fc16cd956 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1094,6 +1094,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) { /// ::= /*empty*/ /// ::= 'ccc' /// ::= 'fastcc' +/// ::= 'kw_intel_ocl_bicc' /// ::= 'coldcc' /// ::= 'x86_stdcallcc' /// ::= 'x86_fastcallcc' @@ -1125,6 +1126,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) { case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break; case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; + case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; case lltok::kw_cc: { unsigned ArbitraryCC; Lex.Lex(); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index ff6d68f0da4..6cffc52d17f 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -77,6 +77,7 @@ namespace lltok { kw_c, kw_cc, kw_ccc, kw_fastcc, kw_coldcc, + kw_intel_ocl_bicc, kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc, kw_msp430_intrcc, diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index c881b4d0ad3..6786756c7fa 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -88,6 +88,21 @@ def RetCC_X86_32_Fast : CallingConv<[ CCDelegateTo<RetCC_X86Common> ]>; +// Intel_OCL_BI return-value 
convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMM2 and XMM3. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. @@ -128,6 +143,10 @@ def RetCC_X86_64 : CallingConv<[ // This is the return-value convention used for the entire X86 backend. def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, CCDelegateTo<RetCC_X86_32> ]>; @@ -235,6 +254,29 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[f80], CCAssignToStack<0, 0>> ]>; +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>, + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCDelegateTo<CC_X86_64_C> +]>; + + def CC_X86_64_GHC : CallingConv<[ // Promote i8/i16/i32 arguments to i64. 
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, @@ -408,6 +450,7 @@ def CC_X86_64 : CallingConv<[ // This is the argument convention used for the entire X86 backend. def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, CCDelegateTo<CC_X86_32> ]>; @@ -426,3 +469,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; + + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 4bcf6b1f19e..73ac7477427 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -229,15 +229,26 @@ const uint16_t * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool callsEHReturn = false; bool ghcCall = false; + bool oclBiCall = false; + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); if (MF) { callsEHReturn = MF->getMMI().callsEHReturn(); const Function *F = MF->getFunction(); ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); + oclBiCall = (F ? 
F->getCallingConv() == CallingConv::Intel_OCL_BI : false); } if (ghcCall) return CSR_NoRegs_SaveList; + if (oclBiCall) { + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_SaveList; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_SaveList; + } if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -252,6 +263,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + + if (CC == CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX) + return CSR_Win64_Intel_OCL_BI_AVX_RegMask; + if (Is64Bit && HasAVX) + return CSR_64_Intel_OCL_BI_AVX_RegMask; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_RegMask; + } if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; if (!Is64Bit) diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp index 5e23e6fc78e..b72c17f667f 100644 --- a/lib/VMCore/AsmWriter.cpp +++ b/lib/VMCore/AsmWriter.cpp @@ -74,6 +74,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; + case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break; diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index fd629b485aa..eb40b09d29f 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -705,6 +705,7 @@ void Verifier::visitFunction(Function &F) { case CallingConv::Cold: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::Intel_OCL_BI: case CallingConv::PTX_Kernel: 
case CallingConv::PTX_Device: Assert1(!F.isVarArg(), diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll new file mode 100644 index 00000000000..f0c219fb986 --- /dev/null +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -0,0 +1,107 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved ymm6-ymm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0 +; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1 +; WIN64: ret + +; preserved ymm8-ymm15 +; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0 +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1 +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, 
<16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: call +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload + +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte 
Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: call +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +} \ No newline at end of file diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll new file mode 100644 index 00000000000..188505072f0 --- /dev/null +++ b/test/CodeGen/X86/sse-intel-ocl.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, 
{{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved xmm6-xmm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: ret + +; preserved xmm8-xmm15 +; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 
16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: call +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +}