llvm-6502/test/CodeGen/X86/avx-splat.ll
Chandler Carruth 33957173a7 [x86] Teach the vector combiner that picks a canonical shuffle from to
support transforming the forms from the new vector shuffle lowering to
use 'movddup' when appropriate.

A bunch of the cases where we actually form 'movddup' don't actually
show up in the test results because something even later than DAG
legalization maps them back to 'unpcklpd'. If this shows back up as
a performance problem, I'll probably chase it down, but it is at least
an encoded size loss. =/

To make this work, also always do this canonicalizing step for floating
point vectors where the baseline shuffle instructions don't provide any
free copies of their inputs. This also causes us to canonicalize
unpck[hl]pd into mov{hl,lh}ps (resp.) which is a nice encoding space
win.

There is one test which is "regressed" by this: extractelement-load.
There, the test case where the optimization it is testing *fails*, the
exact instruction pattern which results is slightly different. This
should probably be fixed by having the appropriate extract formed
earlier in the DAG, but that would defeat the purpose of the test.... If
this test case is critically important for anyone, please let me know
and I'll try to work on it. The prior behavior was actually contrary to
the comment in the test case and seems likely to have been an accident.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217738 91177308-0d34-0410-b5e6-96231b3b80d8
2014-09-14 22:41:37 +00:00

99 lines
3.9 KiB
LLVM

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <32 x i8> %shuffle
}
; CHECK: vpunpckhwd %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i16> %shuffle
}
; CHECK: vmovq
; CHECK-NEXT: vpunpcklqdq %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
; CHECK: vunpcklpd %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
; Test this turns into a broadcast:
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
;
; CHECK: vbroadcastss
define <8 x float> @funcE() nounwind {
allocas:
%udx495 = alloca [18 x [18 x float]], align 32
br label %for_test505.preheader
for_test505.preheader: ; preds = %for_test505.preheader, %allocas
br i1 undef, label %for_exit499, label %for_test505.preheader
for_exit499: ; preds = %for_test505.preheader
br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
load.i1247: ; preds = %for_exit499
%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
%ptr.i1237 = bitcast float* %ptr1227 to i32*
%val.i1238 = load i32* %ptr.i1237, align 4
%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
br label %__load_and_broadcast_32.exit1249
__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499
%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
ret <8 x float> %load_broadcast12281250
}
; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
%tmp = bitcast <8 x i32> %ret7 to <8 x float>
ret <8 x float> %tmp
}
; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
}
; CHECK: vextractf128 $1
; CHECK-NEXT: vpshufd
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x float> %shuffle
}