llvm-6502/test/CodeGen/X86/vec_shuffle-2.ll
Nate Begeman b9a47b824f Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for later further
  cleanups.
New tests that test the above.

Examples:

New:
_shuf2:
	pextrw	$7, %xmm0, %eax
	punpcklqdq	%xmm1, %xmm0
	pshuflw	$128, %xmm0, %xmm0
	pinsrw	$2, %eax, %xmm0

Old:
_shuf2:
	pextrw	$2, %xmm0, %eax
	pextrw	$7, %xmm0, %ecx
	pinsrw	$2, %ecx, %xmm0
	pinsrw	$3, %eax, %xmm0
	movd	%xmm1, %eax
	pinsrw	$4, %eax, %xmm0
	ret

=========

New:
_shuf4:
	punpcklqdq	%xmm1, %xmm0
	pshufb	LCPI1_0, %xmm0

Old:
_shuf4:
	pextrw	$3, %xmm0, %eax
	movsd	%xmm1, %xmm0
	pextrw	$3, %xmm1, %ecx
	pinsrw	$4, %ecx, %xmm0
	pinsrw	$5, %eax, %xmm0

========

New:
_shuf1:
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	pextrw	$1, %xmm0, %eax
	rolw	$8, %ax
	movd	%xmm0, %ecx
	rolw	$8, %cx
	pextrw	$5, %xmm0, %edx
	pextrw	$4, %xmm0, %esi
	pextrw	$3, %xmm0, %edi
	pextrw	$2, %xmm0, %ebx
	movaps	%xmm0, %xmm1
	pinsrw	$0, %ecx, %xmm1
	pinsrw	$1, %eax, %xmm1
	rolw	$8, %bx
	pinsrw	$2, %ebx, %xmm1
	rolw	$8, %di
	pinsrw	$3, %edi, %xmm1
	rolw	$8, %si
	pinsrw	$4, %esi, %xmm1
	rolw	$8, %dx
	pinsrw	$5, %edx, %xmm1
	pextrw	$7, %xmm0, %eax
	rolw	$8, %ax
	movaps	%xmm1, %xmm0
	pinsrw	$7, %eax, %xmm0
	popl	%esi
	popl	%edi
	popl	%ebx
	ret

Old:
_shuf1:
	subl	$252, %esp
	movaps	%xmm0, (%esp)
	movaps	%xmm0, 16(%esp)
	movaps	%xmm0, 32(%esp)
	movaps	%xmm0, 48(%esp)
	movaps	%xmm0, 64(%esp)
	movaps	%xmm0, 80(%esp)
	movaps	%xmm0, 96(%esp)
	movaps	%xmm0, 224(%esp)
	movaps	%xmm0, 208(%esp)
	movaps	%xmm0, 192(%esp)
	movaps	%xmm0, 176(%esp)
	movaps	%xmm0, 160(%esp)
	movaps	%xmm0, 144(%esp)
	movaps	%xmm0, 128(%esp)
	movaps	%xmm0, 112(%esp)
	movzbl	14(%esp), %eax
	movd	%eax, %xmm1
	movzbl	22(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm1, %xmm2
	movzbl	42(%esp), %eax
	movd	%eax, %xmm1
	movzbl	50(%esp), %eax
	movd	%eax, %xmm3
	punpcklbw	%xmm1, %xmm3
	punpcklbw	%xmm2, %xmm3
	movzbl	77(%esp), %eax
	movd	%eax, %xmm1
	movzbl	84(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm1, %xmm2
	movzbl	104(%esp), %eax
	movd	%eax, %xmm1
	punpcklbw	%xmm1, %xmm0
	punpcklbw	%xmm2, %xmm0
	movaps	%xmm0, %xmm1
	punpcklbw	%xmm3, %xmm1
	movzbl	127(%esp), %eax
	movd	%eax, %xmm0
	movzbl	135(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm0, %xmm2
	movzbl	155(%esp), %eax
	movd	%eax, %xmm0
	movzbl	163(%esp), %eax
	movd	%eax, %xmm3
	punpcklbw	%xmm0, %xmm3
	punpcklbw	%xmm2, %xmm3
	movzbl	188(%esp), %eax
	movd	%eax, %xmm0
	movzbl	197(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm0, %xmm2
	movzbl	217(%esp), %eax
	movd	%eax, %xmm4
	movzbl	225(%esp), %eax
	movd	%eax, %xmm0
	punpcklbw	%xmm4, %xmm0
	punpcklbw	%xmm2, %xmm0
	punpcklbw	%xmm3, %xmm0
	punpcklbw	%xmm1, %xmm0
	addl	$252, %esp
	ret



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@65311 91177308-0d34-0410-b5e6-96231b3b80d8
2009-02-23 08:49:38 +00:00

48 lines
2.9 KiB
LLVM

; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
; RUN: grep pshufhw %t | count 1
; RUN: grep pshuflw %t | count 1
; RUN: grep movhps %t | count 1
define void @test1(<2 x i64>* %res, <2 x i64>* %A) nounwind {
%tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1]
%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8]
%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1]
%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1]
%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1]
%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1]
%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1]
%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1]
%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1]
%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1]
%tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0 ; <<8 x i16>> [#uses=1]
%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1]
%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2 ; <<8 x i16>> [#uses=1]
%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1]
%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4 ; <<8 x i16>> [#uses=1]
%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5 ; <<8 x i16>> [#uses=1]
%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6 ; <<8 x i16>> [#uses=1]
%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1]
%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
ret void
}
define void @test2(<4 x float>* %r, <2 x i32>* %A) nounwind {
%tmp = load <4 x float>* %r ; <<4 x float>> [#uses=2]
%tmp.upgrd.3 = bitcast <2 x i32>* %A to double* ; <double*> [#uses=1]
%tmp.upgrd.4 = load double* %tmp.upgrd.3 ; <double> [#uses=1]
%tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0 ; <<2 x double>> [#uses=1]
%tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1 ; <<2 x double>> [#uses=1]
%tmp6 = bitcast <2 x double> %tmp5 to <4 x float> ; <<4 x float>> [#uses=2]
%tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
%tmp7 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
%tmp8 = extractelement <4 x float> %tmp6, i32 0 ; <float> [#uses=1]
%tmp9 = extractelement <4 x float> %tmp6, i32 1 ; <float> [#uses=1]
%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
store <4 x float> %tmp13, <4 x float>* %r
ret void
}