mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-19 01:13:25 +00:00
b9a47b824f
Generate better code for v16i8 shuffles on SSE2 (avoids stack) Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops. Document the shuffle matching logic and add some FIXMEs for later further cleanups. New tests that test the above. Examples: New: _shuf2: pextrw $7, %xmm0, %eax punpcklqdq %xmm1, %xmm0 pshuflw $128, %xmm0, %xmm0 pinsrw $2, %eax, %xmm0 Old: _shuf2: pextrw $2, %xmm0, %eax pextrw $7, %xmm0, %ecx pinsrw $2, %ecx, %xmm0 pinsrw $3, %eax, %xmm0 movd %xmm1, %eax pinsrw $4, %eax, %xmm0 ret ========= New: _shuf4: punpcklqdq %xmm1, %xmm0 pshufb LCPI1_0, %xmm0 Old: _shuf4: pextrw $3, %xmm0, %eax movsd %xmm1, %xmm0 pextrw $3, %xmm1, %ecx pinsrw $4, %ecx, %xmm0 pinsrw $5, %eax, %xmm0 ======== New: _shuf1: pushl %ebx pushl %edi pushl %esi pextrw $1, %xmm0, %eax rolw $8, %ax movd %xmm0, %ecx rolw $8, %cx pextrw $5, %xmm0, %edx pextrw $4, %xmm0, %esi pextrw $3, %xmm0, %edi pextrw $2, %xmm0, %ebx movaps %xmm0, %xmm1 pinsrw $0, %ecx, %xmm1 pinsrw $1, %eax, %xmm1 rolw $8, %bx pinsrw $2, %ebx, %xmm1 rolw $8, %di pinsrw $3, %edi, %xmm1 rolw $8, %si pinsrw $4, %esi, %xmm1 rolw $8, %dx pinsrw $5, %edx, %xmm1 pextrw $7, %xmm0, %eax rolw $8, %ax movaps %xmm1, %xmm0 pinsrw $7, %eax, %xmm0 popl %esi popl %edi popl %ebx ret Old: _shuf1: subl $252, %esp movaps %xmm0, (%esp) movaps %xmm0, 16(%esp) movaps %xmm0, 32(%esp) movaps %xmm0, 48(%esp) movaps %xmm0, 64(%esp) movaps %xmm0, 80(%esp) movaps %xmm0, 96(%esp) movaps %xmm0, 224(%esp) movaps %xmm0, 208(%esp) movaps %xmm0, 192(%esp) movaps %xmm0, 176(%esp) movaps %xmm0, 160(%esp) movaps %xmm0, 144(%esp) movaps %xmm0, 128(%esp) movaps %xmm0, 112(%esp) movzbl 14(%esp), %eax movd %eax, %xmm1 movzbl 22(%esp), %eax movd %eax, %xmm2 punpcklbw %xmm1, %xmm2 movzbl 42(%esp), %eax movd %eax, %xmm1 movzbl 50(%esp), %eax movd %eax, %xmm3 punpcklbw %xmm1, %xmm3 punpcklbw %xmm2, %xmm3 movzbl 77(%esp), %eax movd %eax, %xmm1 movzbl 84(%esp), %eax movd %eax, %xmm2 punpcklbw %xmm1, %xmm2 movzbl 104(%esp), %eax movd %eax, %xmm1 punpcklbw %xmm1, %xmm0 punpcklbw %xmm2, %xmm0 movaps %xmm0, %xmm1 punpcklbw %xmm3, %xmm1 movzbl 127(%esp), %eax movd %eax, %xmm0 movzbl 135(%esp), %eax movd %eax, %xmm2 punpcklbw %xmm0, %xmm2 movzbl 155(%esp), %eax movd %eax, %xmm0 movzbl 163(%esp), %eax movd %eax, %xmm3 punpcklbw %xmm0, %xmm3 punpcklbw %xmm2, %xmm3 movzbl 188(%esp), %eax movd %eax, %xmm0 movzbl 197(%esp), %eax movd %eax, %xmm2 punpcklbw %xmm0, %xmm2 movzbl 217(%esp), %eax movd %eax, %xmm4 movzbl 225(%esp), %eax movd %eax, %xmm0 punpcklbw %xmm4, %xmm0 punpcklbw %xmm2, %xmm0 punpcklbw %xmm3, %xmm0 punpcklbw %xmm1, %xmm0 addl $252, %esp ret git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@65311 91177308-0d34-0410-b5e6-96231b3b80d8
48 lines
2.9 KiB
LLVM
48 lines
2.9 KiB
LLVM
; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
|
|
; RUN: grep pshufhw %t | count 1
|
|
; RUN: grep pshuflw %t | count 1
|
|
; RUN: grep movhps %t | count 1
|
|
|
|
define void @test1(<2 x i64>* %res, <2 x i64>* %A) nounwind {
|
|
%tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1]
|
|
%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8]
|
|
%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1]
|
|
%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1]
|
|
%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1]
|
|
%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1]
|
|
%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1]
|
|
%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1]
|
|
%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1]
|
|
%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1]
|
|
%tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0 ; <<8 x i16>> [#uses=1]
|
|
%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1]
|
|
%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2 ; <<8 x i16>> [#uses=1]
|
|
%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1]
|
|
%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4 ; <<8 x i16>> [#uses=1]
|
|
%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5 ; <<8 x i16>> [#uses=1]
|
|
%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6 ; <<8 x i16>> [#uses=1]
|
|
%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1]
|
|
%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
|
|
store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
|
|
ret void
|
|
}
|
|
|
|
define void @test2(<4 x float>* %r, <2 x i32>* %A) nounwind {
|
|
%tmp = load <4 x float>* %r ; <<4 x float>> [#uses=2]
|
|
%tmp.upgrd.3 = bitcast <2 x i32>* %A to double* ; <double*> [#uses=1]
|
|
%tmp.upgrd.4 = load double* %tmp.upgrd.3 ; <double> [#uses=1]
|
|
%tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0 ; <<2 x double>> [#uses=1]
|
|
%tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1 ; <<2 x double>> [#uses=1]
|
|
%tmp6 = bitcast <2 x double> %tmp5 to <4 x float> ; <<4 x float>> [#uses=2]
|
|
%tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
|
|
%tmp7 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
|
|
%tmp8 = extractelement <4 x float> %tmp6, i32 0 ; <float> [#uses=1]
|
|
%tmp9 = extractelement <4 x float> %tmp6, i32 1 ; <float> [#uses=1]
|
|
%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0 ; <<4 x float>> [#uses=1]
|
|
%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
|
|
%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
|
|
%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
|
|
store <4 x float> %tmp13, <4 x float>* %r
|
|
ret void
|
|
}
|