mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-05-09 20:47:25 +00:00
This patch adds shuffle matching for the SSE3 MOVDDUP, MOVSLDUP and MOVSHDUP instructions. The big use of these being that they avoid many single source shuffles from needing to use (pre-AVX) dual source instructions such as SHUFPD/SHUFPS: causing extra moves and preventing load folds. Adding these instructions uncovered an issue in XFormVExtractWithShuffleIntoLoad which crashed on single operand shuffle instructions (now fixed). It also involved fixing getTargetShuffleMask to correctly identify theses instructions as unary shuffles. Also adds a missing tablegen pattern for MOVDDUP. Differential Revision: http://reviews.llvm.org/D7042 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226716 91177308-0d34-0410-b5e6-96231b3b80d8
98 lines
4.0 KiB
LLVM
98 lines
4.0 KiB
LLVM
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
|
|
|
|
|
|
; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
ret <32 x i8> %shuffle
|
|
}
|
|
|
|
; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
ret <16 x i16> %shuffle
|
|
}
|
|
|
|
; CHECK: vmovq
|
|
; CHECK-NEXT: vmovddup %xmm
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
|
|
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
|
|
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
|
|
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
|
|
ret <4 x i64> %vecinit6.i
|
|
}
|
|
|
|
; CHECK: vmovddup %xmm
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
|
|
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
|
|
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
|
|
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
|
|
ret <4 x double> %vecinit6.i
|
|
}
|
|
|
|
; Test this turns into a broadcast:
|
|
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
|
|
;
|
|
; CHECK: vbroadcastss
|
|
define <8 x float> @funcE() nounwind {
|
|
allocas:
|
|
%udx495 = alloca [18 x [18 x float]], align 32
|
|
br label %for_test505.preheader
|
|
|
|
for_test505.preheader: ; preds = %for_test505.preheader, %allocas
|
|
br i1 undef, label %for_exit499, label %for_test505.preheader
|
|
|
|
for_exit499: ; preds = %for_test505.preheader
|
|
br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
|
|
|
|
load.i1247: ; preds = %for_exit499
|
|
%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
|
|
%ptr.i1237 = bitcast float* %ptr1227 to i32*
|
|
%val.i1238 = load i32* %ptr.i1237, align 4
|
|
%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
|
|
%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
|
|
%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
|
|
br label %__load_and_broadcast_32.exit1249
|
|
|
|
__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499
|
|
%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
|
|
ret <8 x float> %load_broadcast12281250
|
|
}
|
|
|
|
; CHECK: vpermilps $4
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <8 x float> @funcF(i32 %val) nounwind {
|
|
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
|
|
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
|
|
%tmp = bitcast <8 x i32> %ret7 to <8 x float>
|
|
ret <8 x float> %tmp
|
|
}
|
|
|
|
; CHECK: vpermilps $0
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
|
ret <8 x float> %shuffle
|
|
}
|
|
|
|
; CHECK: vextractf128 $1
|
|
; CHECK-NEXT: vpermilps $85
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
|
|
entry:
|
|
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
ret <8 x float> %shuffle
|
|
}
|
|
|