llvm-6502/test/CodeGen/X86/avx-intel-ocl.ll

; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X64 %s

; External callees used by the tests below. They are only declared (not
; defined) in this file.
; NOTE(review): @func_float16_ptr and @func_int are declared here without a
; calling convention, yet every call to them below uses intel_ocl_bicc -
; presumably intentional for an llc-only test; confirm.
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

; WIN64: testf16_inp
; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq    {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret

; X32: testf16_inp
; X32: movl    %eax, (%esp)
; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
; X32: call
; X32: ret

; X64: testf16_inp
; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
; X64: leaq    {{.*}}(%rsp), %rdi
; X64: call
; X64: ret

;test calling conventions - input parameters
; Adds the two <16 x float> arguments, passes the sum plus a pointer to the
; stack slot %y to @func_float16_ptr using the intel_ocl_bicc calling
; convention, and returns (value loaded from %y after the call) + (call result).
; The check lines above expect the two vaddps to target ymm0/ymm1 and the
; address of %y to be formed with leaq into %rcx (Win64 run) or %rdi (Darwin
; x86-64 run); the 32-bit runs expect a store to (%esp) before the call.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  ; Stack slot whose address is the pointer argument of the call.
  %y = alloca <16 x float>, align 16
  ; Vector argument of the call.
  %x = fadd <16 x float> %a, %b
  ; Call under test: one vector argument and one pointer argument.
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  ; Read the stack slot back after the call and combine it with the result.
  %2 = load <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

;test calling conventions - preserved registers

; preserved ymm6-ymm15
; WIN64: testf16_regs
; WIN64: call
; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: ret

; preserved ymm8-ymm15
; X64: testf16_regs
; X64: call
; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: ret

; Same call sequence as @testf16_inp, but %b is used again after the call
; (in %3), so its value has to survive the intel_ocl_bicc call.
; The check lines above expect the post-call vaddps to read that value from
; ymm6/ymm7 on the Win64 run and ymm8/ymm9 on the Darwin x86-64 run, i.e. from
; the register ranges the comments above describe as preserved.
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
  ; Stack slot whose address is the pointer argument of the call.
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>* %y, align 16
  ; %b is consumed here, after the call.
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; test calling conventions - prolog and epilog
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
; WIN64: call
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload

; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
; Here the function itself uses intel_ocl_bicc, while the call it makes to
; @func_float16 carries no calling-convention keyword.
; The check lines above expect 32-byte ymm spills before that call and the
; matching reloads after it (ten on the Win64 run, eight on the Darwin x86-64
; run).
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
   ; Forward both arguments unchanged and return the callee's result.
   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
   ret <16 x float> %c
}

; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X32: movl {{.*}}, 4(%esp)
; X32: movl {{.*}}, (%esp)
; X32: call
; X32: addl {{.*}}, %eax

; pass parameters in registers for 64-bit platform
; X64: leal {{.*}}, %edi
; X64: movl {{.*}}, %esi
; X64: call
; X64: addl {{.*}}, %eax
; Integer-argument case: calls @func_int(%a + %b, %a) with intel_ocl_bicc and
; returns the result plus %b.
; The check lines above expect both arguments to be stored at (%esp) and
; 4(%esp) on the 32-bit runs, and to be placed in %edi and %esi on the Darwin
; x86-64 run, followed by an addl into %eax after the call.
define i32 @test_int(i32 %a, i32 %b) nounwind {
    ; First argument of the call.
    %c1 = add i32 %a, %b
	%c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)
    ; %b is used again after the call.
    %c = add i32 %c2, %b
	ret i32 %c
}

; WIN64: test_float4
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64: ret

; X64: test_float4
; X64-NOT: vzeroupper
; X64: call
; X64-NOT: vzeroupper
; X64: call
; X64: ret

; X32: test_float4
; X32: vzeroupper
; X32: call
; X32: vzeroupper
; X32: call
; X32: ret

; External three-operand <4 x float> callee used by @test_float4; only
; declared (not defined) in this file.
declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)

; Splits each <8 x float> argument into its low and high <4 x float> halves,
; calls @func_float4 (intel_ocl_bicc) once on the low halves and once on the
; high halves, and concatenates the two <4 x float> results into the returned
; <8 x float>.
; The check lines above expect no vzeroupper before either call on the two
; 64-bit runs, and a vzeroupper before each call on the 32-bit runs.
define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {
entry:
  ; Low halves (lanes 0-3) of %a, %b and %c.
  %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind
  ; Widen the first result to 8 lanes; lanes 4-7 are undef.
  %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; High halves (lanes 4-7) of %a, %b and %c.
  %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind
  ; Widen the second result to 8 lanes; lanes 4-7 are undef.
  %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Result = lanes 0-3 of %3 followed by lanes 0-3 of %7 (mask indices 8-11).
  %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %8
}
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X32 %s`
			`; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X32 %s`
The test avx-intel-ocl.ll failed. I can't reproduce on any of my machines. I added -mcpu flag, may be it will fix the problem git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166669 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-25 08:38:42 +00:00			`; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=WIN64 %s`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck -check-prefix=X64 %s`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)`
			`declare <16 x float> @func_float16(<16 x float>, <16 x float>)`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`declare i32 @func_int(i32, i32)`

Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`; WIN64: testf16_inp`
			`; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; WIN64: leaq {{.*}}(%rsp), %rcx`
			`; WIN64: call`
			`; WIN64: ret`

Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X32: testf16_inp`
			`; X32: movl %eax, (%esp)`
			`; X32: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X32: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X32: call`
			`; X32: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: testf16_inp`
			`; X64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X64: vaddps {{.*}}, {{%ymm[0-1]}}`
			`; X64: leaq {{.*}}(%rsp), %rdi`
			`; X64: call`
			`; X64: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`;test calling conventions - input parameters`
			`define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {`
			`%y = alloca <16 x float>, align 16`
			`%x = fadd <16 x float> %a, %b`
Enable MI Sched for x86. This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192750 91177308-0d34-0410-b5e6-96231b3b80d8 2013-10-15 23:33:07 +00:00			`%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`%2 = load <16 x float>* %y, align 16`
			`%3 = fadd <16 x float> %2, %1`
			`ret <16 x float> %3`
			`}`

			`;test calling conventions - preserved registers`

			`; preserved ymm6-ymm15`
			`; WIN64: testf16_regs`
			`; WIN64: call`
Enable MI Sched for x86. This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192750 91177308-0d34-0410-b5e6-96231b3b80d8 2013-10-15 23:33:07 +00:00			`; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}`
			`; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`; WIN64: ret`

			`; preserved ymm8-ymm15`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: testf16_regs`
			`; X64: call`
Enable MI Sched for x86. This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192750 91177308-0d34-0410-b5e6-96231b3b80d8 2013-10-15 23:33:07 +00:00			`; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}`
			`; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: ret`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00
			`define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {`
			`%y = alloca <16 x float>, align 16`
			`%x = fadd <16 x float> %a, %b`
Enable MI Sched for x86. This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192750 91177308-0d34-0410-b5e6-96231b3b80d8 2013-10-15 23:33:07 +00:00			`%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`%2 = load <16 x float>* %y, align 16`
			`%3 = fadd <16 x float> %1, %b`
			`%4 = fadd <16 x float> %2, %3`
			`ret <16 x float> %4`
			`}`

			`; test calling conventions - prolog and epilog`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: vmovaps {{%ymm([6-9]\|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill`
			`; WIN64: call`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`
			`; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]\|1[0-5])}} # 32-byte Reload`

Remove the X86 Maximal Stack Alignment Check pass as it is no longer necessary. This pass was conservative in that it always reserved the FP to enable dynamic stack realignment, which allowed the RA to use aligned spills for vector registers. This happens even when spills were not necessary. The RA has since been improved to use unaligned spills when necessary. The new behavior is to realign the stack if the frame pointer was already reserved for some other reason, but don't reserve the frame pointer just because a function contains vector virtual registers. Part of rdar://12719844 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168627 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 22:55:05 +00:00			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
			`; X64: vmovups {{%ymm([8-9]\|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00			`; X64: call`
Remove the X86 Maximal Stack Alignment Check pass as it is no longer necessary. This pass was conservative in that it always reserved the FP to enable dynamic stack realignment, which allowed the RA to use aligned spills for vector registers. This happens even when spills were not necessary. The RA has since been improved to use unaligned spills when necessary. The new behavior is to realign the stack if the frame pointer was already reserved for some other reason, but don't reserve the frame pointer just because a function contains vector virtual registers. Part of rdar://12719844 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168627 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 22:55:05 +00:00			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
			`; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]\|1[0-5])}} ## 32-byte Folded Reload`
Special calling conventions for Intel OpenCL built-in library. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-24 14:46:16 +00:00			`define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {`
			`%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)`
			`ret <16 x float> %c`
The test avx-intel-ocl.ll failed. I can't reproduce on any of my machines. I added -mcpu flag, may be it will fix the problem git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166669 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-25 08:38:42 +00:00			`}`
Intel OCL built-ins calling conventions now support MacOS 32-bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168359 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-20 09:37:57 +00:00
			`; test functions with integer parameters`
			`; pass parameters on stack for 32-bit platform`
			`; X32: movl {{.*}}, 4(%esp)`
			`; X32: movl {{.*}}, (%esp)`
			`; X32: call`
			`; X32: addl {{.*}}, %eax`

			`; pass parameters in registers for 64-bit platform`
			`; X64: leal {{.*}}, %edi`
			`; X64: movl {{.*}}, %esi`
			`; X64: call`
			`; X64: addl {{.*}}, %eax`
			`define i32 @test_int(i32 %a, i32 %b) nounwind {`
			`%c1 = add i32 %a, %b`
			`%c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)`
			`%c = add i32 %c2, %b`
			`ret i32 %c`
			`}`
Prevent insertion of "vzeroupper" before call that preserves YMM registers, since a caller uses preserved registers across the call. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175043 91177308-0d34-0410-b5e6-96231b3b80d8 2013-02-13 08:02:04 +00:00
			`; WIN64: test_float4`
			`; WIN64-NOT: vzeroupper`
			`; WIN64: call`
			`; WIN64-NOT: vzeroupper`
			`; WIN64: call`
			`; WIN64: ret`

			`; X64: test_float4`
			`; X64-NOT: vzeroupper`
			`; X64: call`
			`; X64-NOT: vzeroupper`
			`; X64: call`
			`; X64: ret`

			`; X32: test_float4`
			`; X32: vzeroupper`
			`; X32: call`
			`; X32: vzeroupper`
			`; X32: call`
			`; X32: ret`

			`declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)`

			`define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {`
			`entry:`
			`%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind`
			`%3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`%4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>`
			`%call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind`
			`%7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`%8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>`
			`ret <8 x float> %8`
			`}`