mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-31 10:34:17 +00:00
807360ab08
Handle the poor codegen for i64/x86xmm->v2i64 (%mm -> %xmm) moves. Instead of using a stack store/load pair to do the job, use scalar_to_vector directly, which in the MMX case can use movq2dq. This was the behavior prior to the improvements for vector legalization of extloads in r213897. This commit fixes the regression and, as a side effect, also removes some unnecessary shuffles.
In the new attached testcase, we go from:
  pshufw $-18, (%rdi), %mm0
  movq %mm0, -8(%rsp)
  movq -8(%rsp), %xmm0
  pshufd $-44, %xmm0, %xmm0
  movd %xmm0, %eax
  ...
To:
  pshufw $-18, (%rdi), %mm0
  movq2dq %mm0, %xmm0
  movd %xmm0, %eax
  ...
Differential Revision: http://reviews.llvm.org/D7126
rdar://problem/19413324
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226953 91177308-0d34-0410-b5e6-96231b3b80d8
30 lines
954 B
LLVM
30 lines
954 B
LLVM
; Test driver: the file is compiled twice with llc, once with -march=x86 and
; once with -march=x86-64, both with MMX and SSE2 enabled (-mattr=+mmx,+sse2).
; Each output is piped to FileCheck under its own prefix (X86-32 and X86-64
; respectively), so the two targets are verified independently.
; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
|
|
; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
|
|
|
|
; Both prefixes anchor their subsequent checks at the test0 label in the
; generated assembly.
; X86-32-LABEL: test0
|
|
; X86-64-LABEL: test0
|
|
; Regression test for transferring a 64-bit value from an MMX register to an
; XMM register. The FileCheck directives embedded in the body expect a pshufw
; followed by a single movq2dq, with no movq in between (and additionally no
; movsd for the 32-bit run, no pshufd for the 64-bit run).
;
; Input:  %v4 - pointer to a <1 x i64> (loaded with align 8).
; Result: the low 32-bit lane of the shuffled value, plus 32.
define i32 @test0(<1 x i64>* %v4) {
|
|
; Load the 64-bit vector and reinterpret it, via <4 x i16>, as an x86_mmx
; value so it can be fed to the MMX shuffle intrinsic.
%v5 = load <1 x i64>* %v4, align 8
|
|
%v12 = bitcast <1 x i64> %v5 to <4 x i16>
|
|
%v13 = bitcast <4 x i16> %v12 to x86_mmx
|
|
; Expected output, 32-bit run: pshufw with immediate 238, then movq2dq, with
; neither movq nor movsd emitted between the two.
; X86-32: pshufw $238
|
|
; X86-32-NOT: movq
|
|
; X86-32-NOT: movsd
|
|
; X86-32: movq2dq
|
|
; Expected output, 64-bit run: pshufw with immediate 238, then movq2dq, with
; neither movq nor pshufd emitted between the two; movd must be on the output
; line directly after movq2dq.
; X86-64: pshufw $238
|
|
; X86-64-NOT: movq
|
|
; X86-64-NOT: pshufd
|
|
; X86-64: movq2dq
|
|
; X86-64-NEXT: movd
|
|
; MMX word shuffle. The i8 immediate -18 is the same byte as 238 (0xEE), which
; is how the checks above spell it.
%v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
|
|
; Reinterpret the MMX result back as <1 x i64> and pull out the scalar i64.
%v15 = bitcast x86_mmx %v14 to <4 x i16>
|
|
%v16 = bitcast <4 x i16> %v15 to <1 x i64>
|
|
%v17 = extractelement <1 x i64> %v16, i32 0
|
|
; View the i64 as two i32 lanes and take lane 0.
%v18 = bitcast i64 %v17 to <2 x i32>
|
|
%v19 = extractelement <2 x i32> %v18, i32 0
|
|
; Add the constant 32 to the extracted lane and return the sum.
%v20 = add i32 %v19, 32
|
|
ret i32 %v20
|
|
}
|
|
|
|
; Declaration of the shuffle intrinsic called by @test0: takes an x86_mmx
; operand and an i8 immediate, and returns an x86_mmx value.
declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
|