llvm-6502/test/CodeGen/X86/vec_splat.ll

; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX

; Build a <4 x float> splat of the scalar %X with a chain of insertelement
; instructions, multiply the vector loaded from %Q by that splat, and store
; the product to %P. Both the SSE2 and the SSE3 run expect "pshufd $0" in the
; generated code for this function.
define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
	%lane0 = insertelement <4 x float> zeroinitializer, float %X, i32 0
	%lane1 = insertelement <4 x float> %lane0, float %X, i32 1
	%lane2 = insertelement <4 x float> %lane1, float %X, i32 2
	%splat = insertelement <4 x float> %lane2, float %X, i32 3
	%vec = load <4 x float>* %Q
	%prod = fmul <4 x float> %vec, %splat
	store <4 x float> %prod, <4 x float>* %P
	ret void

; SSE2-LABEL: test_v4sf:
; SSE2: pshufd $0

; SSE3-LABEL: test_v4sf:
; SSE3: pshufd $0
}

; Build a <2 x double> splat of the scalar %X with two insertelement
; instructions, multiply the vector loaded from %Q by that splat, and store
; the product to %P. The SSE2 run expects "movlhps" and the SSE3 run expects
; "movddup" in the generated code for this function.
define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
	%lane0 = insertelement <2 x double> zeroinitializer, double %X, i32 0
	%splat = insertelement <2 x double> %lane0, double %X, i32 1
	%vec = load <2 x double>* %Q
	%prod = fmul <2 x double> %vec, %splat
	store <2 x double> %prod, <2 x double>* %P
	ret void

; SSE2-LABEL: test_v2sd:
; SSE2: movlhps

; SSE3-LABEL: test_v2sd:
; SSE3: movddup
}

; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
;
; Loads the %i-th <4 x float> from %ptr, extracts the lane selected by %j
; (truncated to i32 before the extractelement), and splats that float into all
; four lanes of the result. The checks below require a vbroadcastss with no
; mention of rsp between the function label and that instruction.
define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
  %2 = load <4 x float>* %1, align 16
  %3 = trunc i64 %j to i32
  %4 = extractelement <4 x float> %2, i32 %3
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = insertelement <4 x float> %5, float %4, i32 1
  %7 = insertelement <4 x float> %6, float %4, i32 2
  %8 = insertelement <4 x float> %7, float %4, i32 3
  ret <4 x float> %8

; The trailing colon ties the label check to this function's own symbol;
; without it the pattern is also a prefix of load_extract_splat1.
; AVX-LABEL: load_extract_splat:
; AVX-NOT: rsp
; AVX: vbroadcastss
}

; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
;
; Same shape as a load/extract/splat with a truncated index, except that the
; i64 lane index %j is passed to extractelement directly (no trunc to i32).
; The checks below require a vbroadcastss with no "movs" between the function
; label and that instruction.
define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
  %2 = load <4 x float>* %1, align 16
  %3 = extractelement <4 x float> %2, i64 %j
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = insertelement <4 x float> %4, float %3, i32 1
  %6 = insertelement <4 x float> %5, float %3, i32 2
  %7 = insertelement <4 x float> %6, float %3, i32 3
  ret <4 x float> %7

; The trailing colon makes the label check match the symbol followed by ':',
; in line with the test_v4sf / test_v2sd label checks in this file.
; AVX-LABEL: load_extract_splat1:
; AVX-NOT: movs
; AVX: vbroadcastss
}
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00			`; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 \| FileCheck %s -check-prefix=SSE2`
			`; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 \| FileCheck %s -check-prefix=SSE3`
[x86] Fold extract_vector_elt of a load into the Load's address computation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215409 91177308-0d34-0410-b5e6-96231b3b80d8 2014-08-11 23:49:33 +00:00			`; RUN: llc < %s -march=x86-64 -mattr=+avx \| FileCheck %s -check-prefix=AVX`
Replace vector splat test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@26963 91177308-0d34-0410-b5e6-96231b3b80d8 2006-03-22 21:39:25 +00:00
Favors pshufd over shufps when shuffling elements from one vector. pshufd is faster than shufps. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49244 91177308-0d34-0410-b5e6-96231b3b80d8 2008-04-05 00:30:36 +00:00			`define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {`
Remove llvm-upgrade and update tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47432 91177308-0d34-0410-b5e6-96231b3b80d8 2008-02-21 07:42:26 +00:00			`%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]`
			`%tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1]`
			`%tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1]`
			`%tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3 ; <<4 x float>> [#uses=1]`
			`%tmp8 = load <4 x float>* %Q ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@72897 91177308-0d34-0410-b5e6-96231b3b80d8 2009-06-04 22:49:04 +00:00			`%tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1]`
Replace vector splat test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@26963 91177308-0d34-0410-b5e6-96231b3b80d8 2006-03-22 21:39:25 +00:00			`store <4 x float> %tmp10, <4 x float>* %P`
			`ret void`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00
Convert CodeGen//.ll tests to use the new CHECK-LABEL for easier debugging. No functionality change and all tests pass after conversion. This was done with the following sed invocation to catch label lines demarking function boundaries: sed -i '' "s/^;\( \)\([A-Z0-9_]\):\( \)test\([A-Za-z0-9_-]\):\( \)$/;\1\2-LABEL:\3test\4:\5/g" test/CodeGen//*.ll which was written conservatively to avoid false positives rather than false negatives. I scanned through all the changes and everything looks correct. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186258 91177308-0d34-0410-b5e6-96231b3b80d8 2013-07-13 20:38:47 +00:00			`; SSE2-LABEL: test_v4sf:`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00			`; SSE2: pshufd $0`

Convert CodeGen//.ll tests to use the new CHECK-LABEL for easier debugging. No functionality change and all tests pass after conversion. This was done with the following sed invocation to catch label lines demarking function boundaries: sed -i '' "s/^;\( \)\([A-Z0-9_]\):\( \)test\([A-Za-z0-9_-]\):\( \)$/;\1\2-LABEL:\3test\4:\5/g" test/CodeGen//*.ll which was written conservatively to avoid false positives rather than false negatives. I scanned through all the changes and everything looks correct. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186258 91177308-0d34-0410-b5e6-96231b3b80d8 2013-07-13 20:38:47 +00:00			`; SSE3-LABEL: test_v4sf:`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00			`; SSE3: pshufd $0`
Replace vector splat test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@26963 91177308-0d34-0410-b5e6-96231b3b80d8 2006-03-22 21:39:25 +00:00			`}`

Favors pshufd over shufps when shuffling elements from one vector. pshufd is faster than shufps. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49244 91177308-0d34-0410-b5e6-96231b3b80d8 2008-04-05 00:30:36 +00:00			`define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {`
Remove llvm-upgrade and update tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47432 91177308-0d34-0410-b5e6-96231b3b80d8 2008-02-21 07:42:26 +00:00			`%tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1]`
			`%tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1]`
			`%tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@72897 91177308-0d34-0410-b5e6-96231b3b80d8 2009-06-04 22:49:04 +00:00			`%tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1]`
Replace vector splat test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@26963 91177308-0d34-0410-b5e6-96231b3b80d8 2006-03-22 21:39:25 +00:00			`store <2 x double> %tmp6, <2 x double>* %P`
			`ret void`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00
Convert CodeGen//.ll tests to use the new CHECK-LABEL for easier debugging. No functionality change and all tests pass after conversion. This was done with the following sed invocation to catch label lines demarking function boundaries: sed -i '' "s/^;\( \)\([A-Z0-9_]\):\( \)test\([A-Za-z0-9_-]\):\( \)$/;\1\2-LABEL:\3test\4:\5/g" test/CodeGen//*.ll which was written conservatively to avoid false positives rather than false negatives. I scanned through all the changes and everything looks correct. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186258 91177308-0d34-0410-b5e6-96231b3b80d8 2013-07-13 20:38:47 +00:00			`; SSE2-LABEL: test_v2sd:`
[x86] Teach the vector combiner that picks a canonical shuffle from to support transforming the forms from the new vector shuffle lowering to use 'movddup' when appropriate. A bunch of the cases where we actually form 'movddup' don't actually show up in the test results because something even later than DAG legalization maps them back to 'unpcklpd'. If this shows back up as a performance problem, I'll probably chase it down, but it is at least an encoded size loss. =/ To make this work, also always do this canonicalizing step for floating point vectors where the baseline shuffle instructions don't provide any free copies of their inputs. This also causes us to canonicalize unpck[hl]pd into mov{hl,lh}ps (resp.) which is a nice encoding space win. There is one test which is "regressed" by this: extractelement-load. There, the test case where the optimization it is testing fails, the exact instruction pattern which results is slightly different. This should probably be fixed by having the appropriate extract formed earlier in the DAG, but that would defeat the purpose of the test.... If this test case is critically important for anyone, please let me know and I'll try to work on it. The prior behavior was actually contrary to the comment in the test case and seems likely to have been an accident. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217738 91177308-0d34-0410-b5e6-96231b3b80d8 2014-09-14 22:41:37 +00:00			`; SSE2: movlhps`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00
Convert CodeGen//.ll tests to use the new CHECK-LABEL for easier debugging. No functionality change and all tests pass after conversion. This was done with the following sed invocation to catch label lines demarking function boundaries: sed -i '' "s/^;\( \)\([A-Z0-9_]\):\( \)test\([A-Za-z0-9_-]\):\( \)$/;\1\2-LABEL:\3test\4:\5/g" test/CodeGen//*.ll which was written conservatively to avoid false positives rather than false negatives. I scanned through all the changes and everything looks correct. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186258 91177308-0d34-0410-b5e6-96231b3b80d8 2013-07-13 20:38:47 +00:00			`; SSE3-LABEL: test_v2sd:`
FileCheckize and merge some tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173568 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:14:32 +00:00			`; SSE3: movddup`
Replace vector splat test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@26963 91177308-0d34-0410-b5e6-96231b3b80d8 2006-03-22 21:39:25 +00:00			`}`
[IR] Make {extract,insert}element accept an index of any integer type. Given the following C code llvm currently generates suboptimal code for x86-64: __m128 bss4( const __m128 ptr, size_t i, size_t j ) { float f = ptr[i][j]; return (__m128) { f, f, f, f }; } ================================================= define <4 x float> @_Z4bss4PKDv4_fmm(<4 x float> nocapture readonly %ptr, i64 %i, i64 %j) #0 { %a1 = getelementptr inbounds <4 x float>* %ptr, i64 %i %a2 = load <4 x float>* %a1, align 16, !tbaa !1 %a3 = trunc i64 %j to i32 %a4 = extractelement <4 x float> %a2, i32 %a3 %a5 = insertelement <4 x float> undef, float %a4, i32 0 %a6 = insertelement <4 x float> %a5, float %a4, i32 1 %a7 = insertelement <4 x float> %a6, float %a4, i32 2 %a8 = insertelement <4 x float> %a7, float %a4, i32 3 ret <4 x float> %a8 } ================================================= shlq $4, %rsi addq %rdi, %rsi movslq %edx, %rax vbroadcastss (%rsi,%rax,4), %xmm0 retq ================================================= The movslq is uneeded, but is present because of the trunc to i32 and then sext back to i64 that the backend adds for vbroadcastss. We can't remove it because it changes the meaning. The IR that clang generates is already suboptimal. What clang really should emit is: %a4 = extractelement <4 x float> %a2, i64 %j This patch makes that legal. A separate patch will teach clang to do it. Differential Revision: http://reviews.llvm.org/D3519 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207801 91177308-0d34-0410-b5e6-96231b3b80d8 2014-05-01 22:12:39 +00:00
			`; Fold extract of a load into the load's address computation. This avoids spilling to the stack.`
			`define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {`
[x86] Fold extract_vector_elt of a load into the Load's address computation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215409 91177308-0d34-0410-b5e6-96231b3b80d8 2014-08-11 23:49:33 +00:00			`%1 = getelementptr inbounds <4 x float>* %ptr, i64 %i`
			`%2 = load <4 x float>* %1, align 16`
			`%3 = trunc i64 %j to i32`
			`%4 = extractelement <4 x float> %2, i32 %3`
			`%5 = insertelement <4 x float> undef, float %4, i32 0`
			`%6 = insertelement <4 x float> %5, float %4, i32 1`
			`%7 = insertelement <4 x float> %6, float %4, i32 2`
			`%8 = insertelement <4 x float> %7, float %4, i32 3`
			`ret <4 x float> %8`

			`; AVX-LABEL: load_extract_splat`
			`; AVX-NOT: rsp`
			`; AVX: vbroadcastss`
			`}`

			`; Fold extract of a load into the load's address computation. This avoids spilling to the stack.`
			`define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {`
[IR] Make {extract,insert}element accept an index of any integer type. Given the following C code llvm currently generates suboptimal code for x86-64: __m128 bss4( const __m128 ptr, size_t i, size_t j ) { float f = ptr[i][j]; return (__m128) { f, f, f, f }; } ================================================= define <4 x float> @_Z4bss4PKDv4_fmm(<4 x float> nocapture readonly %ptr, i64 %i, i64 %j) #0 { %a1 = getelementptr inbounds <4 x float>* %ptr, i64 %i %a2 = load <4 x float>* %a1, align 16, !tbaa !1 %a3 = trunc i64 %j to i32 %a4 = extractelement <4 x float> %a2, i32 %a3 %a5 = insertelement <4 x float> undef, float %a4, i32 0 %a6 = insertelement <4 x float> %a5, float %a4, i32 1 %a7 = insertelement <4 x float> %a6, float %a4, i32 2 %a8 = insertelement <4 x float> %a7, float %a4, i32 3 ret <4 x float> %a8 } ================================================= shlq $4, %rsi addq %rdi, %rsi movslq %edx, %rax vbroadcastss (%rsi,%rax,4), %xmm0 retq ================================================= The movslq is uneeded, but is present because of the trunc to i32 and then sext back to i64 that the backend adds for vbroadcastss. We can't remove it because it changes the meaning. The IR that clang generates is already suboptimal. What clang really should emit is: %a4 = extractelement <4 x float> %a2, i64 %j This patch makes that legal. A separate patch will teach clang to do it. Differential Revision: http://reviews.llvm.org/D3519 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207801 91177308-0d34-0410-b5e6-96231b3b80d8 2014-05-01 22:12:39 +00:00			`%1 = getelementptr inbounds <4 x float>* %ptr, i64 %i`
			`%2 = load <4 x float>* %1, align 16`
			`%3 = extractelement <4 x float> %2, i64 %j`
			`%4 = insertelement <4 x float> undef, float %3, i32 0`
			`%5 = insertelement <4 x float> %4, float %3, i32 1`
			`%6 = insertelement <4 x float> %5, float %3, i32 2`
			`%7 = insertelement <4 x float> %6, float %3, i32 3`
			`ret <4 x float> %7`

[x86] Fold extract_vector_elt of a load into the Load's address computation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215409 91177308-0d34-0410-b5e6-96231b3b80d8 2014-08-11 23:49:33 +00:00			`; AVX-LABEL: load_extract_splat1`
[IR] Make {extract,insert}element accept an index of any integer type. Given the following C code llvm currently generates suboptimal code for x86-64: __m128 bss4( const __m128 ptr, size_t i, size_t j ) { float f = ptr[i][j]; return (__m128) { f, f, f, f }; } ================================================= define <4 x float> @_Z4bss4PKDv4_fmm(<4 x float> nocapture readonly %ptr, i64 %i, i64 %j) #0 { %a1 = getelementptr inbounds <4 x float>* %ptr, i64 %i %a2 = load <4 x float>* %a1, align 16, !tbaa !1 %a3 = trunc i64 %j to i32 %a4 = extractelement <4 x float> %a2, i32 %a3 %a5 = insertelement <4 x float> undef, float %a4, i32 0 %a6 = insertelement <4 x float> %a5, float %a4, i32 1 %a7 = insertelement <4 x float> %a6, float %a4, i32 2 %a8 = insertelement <4 x float> %a7, float %a4, i32 3 ret <4 x float> %a8 } ================================================= shlq $4, %rsi addq %rdi, %rsi movslq %edx, %rax vbroadcastss (%rsi,%rax,4), %xmm0 retq ================================================= The movslq is uneeded, but is present because of the trunc to i32 and then sext back to i64 that the backend adds for vbroadcastss. We can't remove it because it changes the meaning. The IR that clang generates is already suboptimal. What clang really should emit is: %a4 = extractelement <4 x float> %a2, i64 %j This patch makes that legal. A separate patch will teach clang to do it. Differential Revision: http://reviews.llvm.org/D3519 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207801 91177308-0d34-0410-b5e6-96231b3b80d8 2014-05-01 22:12:39 +00:00			`; AVX-NOT: movs`
			`; AVX: vbroadcastss`
			`}`