llvm-6502/test/CodeGen/X86/avx-intel-ocl.ll

; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X64 %s

; External callees used by the tests in this file (declarations only).
; Takes a <16 x float> by value plus a pointer to a <16 x float>.
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
; Takes two <16 x float> values.
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
; Takes two i32 values.
declare i32 @func_int(i32, i32)

; WIN64: testf16_inp
; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq    {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret

; X32: testf16_inp
; X32: movl    %eax, (%esp)
; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
; X32: call
; X32: ret

; X64: testf16_inp
; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
; X64: leaq    {{.*}}(%rsp), %rdi
; X64: call
; X64: ret

;test calling conventions - input parameters
; Adds the two <16 x float> arguments, hands the sum and the address of a local
; <16 x float> slot to @func_float16_ptr using intel_ocl_bicc, then returns the
; slot's contents added to the call result.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  %slot = alloca <16 x float>, align 16
  %sum = fadd <16 x float> %a, %b
  %res = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %sum, <16 x float>* %slot)
  %stored = load <16 x float>* %slot, align 16
  %total = fadd <16 x float> %stored, %res
  ret <16 x float> %total
}

;test calling conventions - preserved registers

; preserved ymm6-ymm15
; WIN64: testf16_regs
; WIN64: call
; WIN64: vaddps  {{%ymm[6-7]}}, %ymm0, %ymm0
; WIN64: vaddps  {{%ymm[6-7]}}, %ymm1, %ymm1
; WIN64: ret

; preserved ymm8-ymm15
; X64: testf16_regs
; X64: call
; X64: vaddps  {{%ymm[8-9]}}, %ymm0, %ymm0
; X64: vaddps  {{%ymm[8-9]}}, %ymm1, %ymm1
; X64: ret

; Same call sequence as @testf16_inp, but %b is used again after the
; intel_ocl_bicc call, so its value has to stay live across the call.
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
  %slot = alloca <16 x float>, align 16
  %sum = fadd <16 x float> %a, %b
  %res = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %sum, <16 x float>* %slot)
  %stored = load <16 x float>* %slot, align 16
  ; %b is consumed here, after the call.
  %res.plus.b = fadd <16 x float> %res, %b
  %total = fadd <16 x float> %stored, %res.plus.b
  ret <16 x float> %total
}

; test calling conventions - prolog and epilog
; @test_prolog_epilog is itself intel_ocl_bicc and calls @func_float16 with the
; default convention, so the checks expect YMM spills before the call and the
; matching reloads after it (ten for the Win64 run, eight for the Darwin
; 64-bit run). Each prefix is anchored on the function name first so that the
; spill patterns cannot be satisfied by output from an earlier function.
; WIN64: test_prolog_epilog
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: call
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload

; X64: test_prolog_epilog
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
  ; The callee uses the default calling convention, unlike this function.
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}

; test functions with integer parameters
; Both check groups below start with the function name so that the argument
; set-up patterns are matched inside @test_int and not inside whichever
; function happens to follow the previously matched line.

; pass parameters on stack for 32-bit platform
; X32: test_int
; X32: movl {{.*}}, 4(%esp)
; X32: movl {{.*}}, (%esp)
; X32: call
; X32: addl {{.*}}, %eax

; pass parameters in registers for 64-bit platform
; X64: test_int
; X64: leal {{.*}}, %edi
; X64: movl {{.*}}, %esi
; X64: call
; X64: addl {{.*}}, %eax
define i32 @test_int(i32 %a, i32 %b) nounwind {
  %c1 = add i32 %a, %b
  ; First argument is a+b, second is a; %b is needed again after the call.
  %c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)
  %c = add i32 %c2, %b
  ret i32 %c
}

; WIN64: test_float4
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64: ret

; X64: test_float4
; X64-NOT: vzeroupper
; X64: call
; X64-NOT: vzeroupper
; X64: call
; X64: ret

; X32: test_float4
; X32: vzeroupper
; X32: call
; X32: vzeroupper
; X32: call
; X32: ret

; External callee for @test_float4; takes three <4 x float> values.
declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)

; Splits each <8 x float> argument into its low (lanes 0-3) and high
; (lanes 4-7) halves, calls @func_float4 with intel_ocl_bicc once per half,
; and joins the two <4 x float> results back into a single <8 x float>.
define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {
entry:
  ; Low halves of the three inputs.
  %a.lo = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.lo = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %c.lo = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %call.lo = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %a.lo, <4 x float> %b.lo, <4 x float> %c.lo) nounwind
  ; Widen the 4-lane result to 8 lanes; the upper four lanes are undef.
  %wide.lo = shufflevector <4 x float> %call.lo, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; High halves of the three inputs.
  %a.hi = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %call.hi = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %a.hi, <4 x float> %b.hi, <4 x float> %c.hi) nounwind
  %wide.hi = shufflevector <4 x float> %call.hi, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Lanes 0-3 come from the first call, lanes 4-7 from the second.
  %joined = shufflevector <8 x float> %wide.lo, <8 x float> %wide.hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %joined
}
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X32 %s`
			`; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X32 %s`
The test avx-intel-ocl.ll failed. I can't reproduce it on any of my machines. I added the -mcpu flag; maybe it will fix the problem. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166669 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-25 08:38:42 +00:00			`; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=WIN64 %s`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X64 %s`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)`
			`declare <16 x float> @func_float16(<16 x float>, <16 x float>)`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`declare i32 @func_int(i32, i32)`

Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`; WIN64: testf16_inp`
			`; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; WIN64: leaq {{.*}}(%rsp), %rcx`
			`; WIN64: call`
			`; WIN64: ret`

Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X32: testf16_inp`
			`; X32: movl %eax, (%esp)`
			`; X32: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X32: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X32: call`
			`; X32: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: testf16_inp`
			`; X64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X64: leaq {{.*}}(%rsp), %rdi`
			`; X64: call`
			`; X64: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`;test calling conventions - input parameters`
			`define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {`
			`%y = alloca <16 x float>, align 16`
			`%x = fadd <16 x float> %a, %b`
			`%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)`
			`%2 = load <16 x float>* %y, align 16`
			`%3 = fadd <16 x float> %2, %1`
			`ret <16 x float> %3`
			`}`

			`;test calling conventions - preserved registers`

			`; preserved ymm6-ymm15`
			`; WIN64: testf16_regs`
			`; WIN64: call`
			`; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0`
			`; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1`
			`; WIN64: ret`

			`; preserved ymm8-ymm15`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: testf16_regs`
			`; X64: call`
			`; X64: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0`
			`; X64: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1`
			`; X64: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {`
			`%y = alloca <16 x float>, align 16`
			`%x = fadd <16 x float> %a, %b`
			`%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)`
			`%2 = load <16 x float>* %y, align 16`
			`%3 = fadd <16 x float> %1, %b`
			`%4 = fadd <16 x float> %2, %3`
			`ret <16 x float> %4`
			`}`

			`; test calling conventions - prolog and epilog`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: call`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`

Remove the X86 Maximal Stack Alignment Check pass as it is no longer necessary. This pass was conservative in that it always reserved the FP to enable dynamic stack realignment, which allowed the RA to use aligned spills for vector registers. This happens even when spills were not necessary. The RA has since been improved to use unaligned spills when necessary. The new behavior is to realign the stack if the frame pointer was already reserved for some other reason, but don't reserve the frame pointer just because a function contains vector virtual registers. Part of rdar://12719844 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168627 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 22:55:05 +00:00			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: call`
Remove the X86 Maximal Stack Alignment Check pass as it is no longer necessary. This pass was conservative in that it always reserved the FP to enable dynamic stack realignment, which allowed the RA to use aligned spills for vector registers. This happens even when spills were not necessary. The RA has since been improved to use unaligned spills when necessary. The new behavior is to realign the stack if the frame pointer was already reserved for some other reason, but don't reserve the frame pointer just because a function contains vector virtual registers. Part of rdar://12719844 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168627 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 22:55:05 +00:00			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {`
			`%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)`
			`ret <16 x float> %c`
The test avx-intel-ocl.ll failed. I can't reproduce it on any of my machines. I added the -mcpu flag; maybe it will fix the problem. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166669 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-25 08:38:42 +00:00			`}`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00
			`; test functions with integer parameters`
			`; pass parameters on stack for 32-bit platform`
			`; X32: movl {{.*}}, 4(%esp)`
			`; X32: movl {{.*}}, (%esp)`
			`; X32: call`
			`; X32: addl {{.*}}, %eax`

			`; pass parameters in registers for 64-bit platform`
			`; X64: leal {{.*}}, %edi`
			`; X64: movl {{.*}}, %esi`
			`; X64: call`
			`; X64: addl {{.*}}, %eax`
			`define i32 @test_int(i32 %a, i32 %b) nounwind {`
			`%c1 = add i32 %a, %b`
			`%c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)`
			`%c = add i32 %c2, %b`
			`ret i32 %c`
			`}`
Prevent insertion of "vzeroupper" before call that preserves YMM registers, since a caller uses preserved registers across the call. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175043 91177308-0d34-0410-b5e6-96231b3b80d8 2013-02-13 08:02:04 +00:00
			`; WIN64: test_float4`
			`; WIN64-NOT: vzeroupper`
			`; WIN64: call`
			`; WIN64-NOT: vzeroupper`
			`; WIN64: call`
			`; WIN64: ret`

			`; X64: test_float4`
			`; X64-NOT: vzeroupper`
			`; X64: call`
			`; X64-NOT: vzeroupper`
			`; X64: call`
			`; X64: ret`

			`; X32: test_float4`
			`; X32: vzeroupper`
			`; X32: call`
			`; X32: vzeroupper`
			`; X32: call`
			`; X32: ret`

			`declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)`

			`define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {`
			`entry:`
			`%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind`
			`%3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`%4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind`
			`%7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`%8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>`
			`ret <8 x float> %8`
			`}`