llvm-6502/test/CodeGen/X86/avx-splat.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s


; CHECK: vpunpcklbw %xmm
; CHECK-NEXT: vpunpckhbw %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <32 x i8> %shuffle
}

; CHECK: vpunpckhwd %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i16> %shuffle
}

; CHECK: vmovq
; CHECK-NEXT: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

; CHECK: vpermilpd $0
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

; Test this turns into a broadcast:
;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
;   
; CHECK: vbroadcastss
define <8 x float> @funcE() nounwind {
allocas:
  %udx495 = alloca [18 x [18 x float]], align 32
  br label %for_test505.preheader

for_test505.preheader:                            ; preds = %for_test505.preheader, %allocas
  br i1 undef, label %for_exit499, label %for_test505.preheader

for_exit499:                                      ; preds = %for_test505.preheader
  br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247

load.i1247:                                       ; preds = %for_exit499
  %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
  %ptr.i1237 = bitcast float* %ptr1227 to i32*
  %val.i1238 = load i32* %ptr.i1237, align 4
  %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
  %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
  %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
  br label %__load_and_broadcast_32.exit1249

__load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_exit499
  %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
  ret <8 x float> %load_broadcast12281250
}

; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
  %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
  %tmp = bitcast <8 x i32> %ret7 to <8 x float>
  ret <8 x float> %tmp
}

; CHECK: vpshufd  $0
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x float> %shuffle
}

; CHECK: vextractf128  $1
; CHECK-NEXT: vpshufd
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %shuffle
}
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8 2011-07-21 01:55:47 +00:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck %s`


Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding sclarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138321 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-23 04:36:33 +00:00			`; CHECK: vpunpcklbw %xmm`
			`; CHECK-NEXT: vpunpckhbw %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173569 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:44:21 +00:00			`; CHECK-NEXT: vpshufd $85`
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8 2011-07-21 01:55:47 +00:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <32 x i8> %shuffle`
			`}`

Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding sclarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138321 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-23 04:36:33 +00:00			`; CHECK: vpunpckhwd %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173569 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-26 11:44:21 +00:00			`; CHECK-NEXT: vpshufd $85`
- Register v16i16 as valid VR256 register class - Add more bitcasts for v16i16 - Since 135661 and 135662 already added the splat logic, just add one more splat test for v16i16 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135663 91177308-0d34-0410-b5e6-96231b3b80d8 2011-07-21 02:24:08 +00:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <16 x i16> %shuffle`
			`}`

Remove some instructions that existed to provide aliases to the assembler. Can be done with InstAlias instead. Unfortunately, this was causing printer to use 'vmovq' or 'vmovd' based on what was parsed. To cleanup the inconsistencies convert all 'vmovd' with 64-bit registers to 'vmovq', but provide an alias so that 'vmovd' will still parse. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192171 91177308-0d34-0410-b5e6-96231b3b80d8 2013-10-08 05:53:50 +00:00			`; CHECK: vmovq`
Fix a nasty bug where a v4i64 was being wrong emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138392 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-23 22:06:37 +00:00			`; CHECK-NEXT: vmovlhps %xmm`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136002 91177308-0d34-0410-b5e6-96231b3b80d8 2011-07-25 23:05:25 +00:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0`
			`%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1`
			`%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2`
			`%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3`
			`ret <4 x i64> %vecinit6.i`
			`}`

Add instruction selection for 256-bit VPSHUFD and 128-bit VPERMILPS/VPERMILPD. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149968 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-07 06:28:42 +00:00			`; CHECK: vpermilpd $0`
Fix a nasty bug where a v4i64 was being wrong emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138392 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-23 22:06:37 +00:00			`; CHECK-NEXT: vinsertf128 $1`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136002 91177308-0d34-0410-b5e6-96231b3b80d8 2011-07-25 23:05:25 +00:00			`define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x double> undef, double %q, i32 0`
			`%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1`
			`%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2`
			`%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3`
			`ret <4 x double> %vecinit6.i`
			`}`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-02 16:06:18 +00:00
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212602 91177308-0d34-0410-b5e6-96231b3b80d8 2014-07-09 10:06:58 +00:00			`; Test this turns into a broadcast:`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-02 16:06:18 +00:00			`; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>`
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212602 91177308-0d34-0410-b5e6-96231b3b80d8 2014-07-09 10:06:58 +00:00			`;`
			`; CHECK: vbroadcastss`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137296 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-11 02:49:44 +00:00			`define <8 x float> @funcE() nounwind {`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-02 16:06:18 +00:00			`allocas:`
			`%udx495 = alloca [18 x [18 x float]], align 32`
			`br label %for_test505.preheader`

			`for_test505.preheader: ; preds = %for_test505.preheader, %allocas`
			`br i1 undef, label %for_exit499, label %for_test505.preheader`

			`for_exit499: ; preds = %for_test505.preheader`
			`br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247`

			`load.i1247: ; preds = %for_exit499`
			`%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1`
			`%ptr.i1237 = bitcast float* %ptr1227 to i32*`
			`%val.i1238 = load i32* %ptr.i1237, align 4`
			`%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6`
			`%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7`
			`%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>`
			`br label %__load_and_broadcast_32.exit1249`

			`__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499`
			`%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137296 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-11 02:49:44 +00:00			`ret <8 x float> %load_broadcast12281250`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-02 16:06:18 +00:00			`}`

Normalize splat 256bit vectors with 8 elements. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168600 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 19:24:31 +00:00			`; CHECK: vpshufd $0`
			`; CHECK-NEXT: vinsertf128 $1`
Update test to not use the scalar type to splat from a load git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137809 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-17 02:29:15 +00:00			`define <8 x float> @funcF(i32 %val) nounwind {`
Use the splat index to generate the desired shuffle. Otherwise we could only get undefs and the vector shuffle becomes an undef, generating wrong code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137295 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-11 02:49:41 +00:00			`%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6`
			`%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7`
			`%tmp = bitcast <8 x i32> %ret7 to <8 x float>`
			`ret <8 x float> %tmp`
			`}`

X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169624 91177308-0d34-0410-b5e6-96231b3b80d8 2012-12-07 19:01:13 +00:00			`; CHECK: vpshufd $0`
Normalize splat 256bit vectors with 8 elements. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168600 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 19:24:31 +00:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137296 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-11 02:49:44 +00:00			`define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>`
			`ret <8 x float> %shuffle`
			`}`

			`; CHECK: vextractf128 $1`
X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169624 91177308-0d34-0410-b5e6-96231b3b80d8 2012-12-07 19:01:13 +00:00			`; CHECK-NEXT: vpshufd`
Normalize splat 256bit vectors with 8 elements. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168600 91177308-0d34-0410-b5e6-96231b3b80d8 2012-11-26 19:24:31 +00:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137296 91177308-0d34-0410-b5e6-96231b3b80d8 2011-08-11 02:49:44 +00:00			`define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <8 x float> %shuffle`
			`}`