llvm-6502/test/CodeGen/X86/sse3.ll
Chris Lattner d1b7382983 Fix some issues in WalkChainUsers dealing with
CopyToReg/CopyFromReg/INLINEASM.  These are annoying because
they have the same opcode before an after isel.  Fix this by
setting their NodeID to -1 to indicate that they are selected,
just like what automatically happens when selecting things that
end up being machine nodes.

With that done, give IsLegalToFold a new flag that causes it to
ignore chains.  This lets the HandleMergeInputChains routine be
the one place that validates chains after a match is successful,
enabling the new hotness in chain processing.  This smarter
chain processing eliminates the need for "PreprocessRMW" in the
X86 and MSP430 backends and enables MSP to start matching it's
multiple mem operand instructions more aggressively.

I currently #if out the dead code in the X86 backend and MSP 
backend, I'll remove it for real in a follow-on patch.

The testcase changes are:
  test/CodeGen/X86/sse3.ll: we generate better code
  test/CodeGen/X86/store_op_load_fold2.ll: PreprocessRMW was 
      miscompiling this before, we now generate correct code
      Convert it to filecheck while I'm at it.
  test/CodeGen/MSP430/Inst16mm.ll: Add a testcase for mem/mem
      folding to make anton happy. :)




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@97596 91177308-0d34-0410-b5e6-96231b3b80d8
2010-03-02 22:20:06 +00:00

263 lines
9.1 KiB
LLVM

; These are tests for SSE3 codegen. Yonah has SSE3 and earlier but not SSSE3+.
; RUN: llc < %s -march=x86-64 -mcpu=yonah -mtriple=i686-apple-darwin9 -O3 \
; RUN: | FileCheck %s --check-prefix=X64
; Test for v8xi16 lowering where we extract the first element of the vector and
; placed it in the second element of the result.
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
entry:
%tmp3 = load <8 x i16>* %old
%tmp6 = shufflevector <8 x i16> %tmp3,
<8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
<8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
store <8 x i16> %tmp6, <8 x i16>* %dest
ret void
; X64: t0:
; X64: movddup (%rsi), %xmm0
; X64: xorl %eax, %eax
; X64: pshuflw $0, %xmm0, %xmm0
; X64: pinsrw $0, %eax, %xmm0
; X64: movaps %xmm0, (%rdi)
; X64: ret
}
define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp3
; X64: t1:
; X64: movl (%rsi), %eax
; X64: movaps (%rdi), %xmm0
; X64: pinsrw $0, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
; X64: t2:
; X64: pextrw $1, %xmm1, %eax
; X64: pinsrw $0, %eax, %xmm0
; X64: pinsrw $3, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
ret <8 x i16> %tmp
; X64: t3:
; X64: pextrw $5, %xmm0, %eax
; X64: pshuflw $44, %xmm0, %xmm0
; X64: pshufhw $27, %xmm0, %xmm0
; X64: pinsrw $3, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
ret <8 x i16> %tmp
; X64: t4:
; X64: pextrw $7, %xmm0, %eax
; X64: pshufhw $100, %xmm0, %xmm2
; X64: pinsrw $1, %eax, %xmm2
; X64: pextrw $1, %xmm0, %eax
; X64: movaps %xmm2, %xmm0
; X64: pinsrw $4, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
ret <8 x i16> %tmp
; X64: t5:
; X64: movlhps %xmm1, %xmm0
; X64: pshufd $114, %xmm0, %xmm0
; X64: ret
}
define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
; X64: t6:
; X64: movss %xmm1, %xmm0
; X64: ret
}
define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
ret <8 x i16> %tmp
; X64: t7:
; X64: pshuflw $-80, %xmm0, %xmm0
; X64: pshufhw $-56, %xmm0, %xmm0
; X64: ret
}
define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
%tmp = load <2 x i64>* %A
%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1
%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2
%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3
%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4
%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5
%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6
%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7
%tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1
%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2
%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3
%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4
%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5
%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6
%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7
%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
ret void
; X64: t8:
; X64: pshuflw $-58, (%rsi), %xmm0
; X64: pshufhw $-58, %xmm0, %xmm0
; X64: movaps %xmm0, (%rdi)
; X64: ret
}
define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
%tmp = load <4 x float>* %r
%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
%tmp.upgrd.4 = load double* %tmp.upgrd.3
%tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0
%tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1
%tmp6 = bitcast <2 x double> %tmp5 to <4 x float>
%tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0
%tmp7 = extractelement <4 x float> %tmp, i32 1
%tmp8 = extractelement <4 x float> %tmp6, i32 0
%tmp9 = extractelement <4 x float> %tmp6, i32 1
%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0
%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
store <4 x float> %tmp13, <4 x float>* %r
ret void
; X64: t9:
; X64: movaps (%rdi), %xmm0
; X64: movhps (%rsi), %xmm0
; X64: movaps %xmm0, (%rdi)
; X64: ret
}
; FIXME: This testcase produces icky code. It can be made much better!
; PR2585
@g1 = external constant <4 x i32>
@g2 = external constant <4 x i16>
define internal void @t10() nounwind {
load <4 x i32>* @g1, align 16
bitcast <4 x i32> %1 to <8 x i16>
shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
bitcast <8 x i16> %3 to <2 x i64>
extractelement <2 x i64> %4, i32 0
bitcast i64 %5 to <4 x i16>
store <4 x i16> %6, <4 x i16>* @g2, align 8
ret void
; X64: t10:
; X64: pextrw $4, %xmm0, %eax
; X64: pextrw $6, %xmm0, %edx
; X64: movlhps %xmm1, %xmm1
; X64: pshuflw $8, %xmm1, %xmm1
; X64: pinsrw $2, %eax, %xmm1
; X64: pinsrw $3, %edx, %xmm1
}
; Pack various elements via shuffles.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp7
; X64: t11:
; X64: movlhps %xmm0, %xmm0
; X64: movd %xmm1, %eax
; X64: pshuflw $1, %xmm0, %xmm0
; X64: pinsrw $1, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
ret <8 x i16> %tmp9
; X64: t12:
; X64: movlhps %xmm0, %xmm0
; X64: pextrw $3, %xmm1, %eax
; X64: pshufhw $3, %xmm0, %xmm0
; X64: pinsrw $5, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
ret <8 x i16> %tmp9
; X64: t13:
; X64: punpcklqdq %xmm0, %xmm1
; X64: pextrw $3, %xmm1, %eax
; X64: pshufd $52, %xmm1, %xmm0
; X64: pinsrw $4, %eax, %xmm0
; X64: ret
}
define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
ret <8 x i16> %tmp9
; X64: t14:
; X64: punpcklqdq %xmm0, %xmm1
; X64: pshufhw $8, %xmm1, %xmm0
; X64: ret
}
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp8
; X64: t15:
; X64: pextrw $7, %xmm0, %eax
; X64: punpcklqdq %xmm1, %xmm0
; X64: pshuflw $-128, %xmm0, %xmm0
; X64: pinsrw $2, %eax, %xmm0
; X64: ret
}
; Test yonah where we convert a shuffle to pextrw and pinrsw
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
entry:
%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
%tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
ret <16 x i8> %tmp9
; X64: t16:
; X64: pinsrw $0, %eax, %xmm1
; X64: pextrw $8, %xmm0, %eax
; X64: pinsrw $1, %eax, %xmm1
; X64: pextrw $1, %xmm1, %ecx
; X64: movd %xmm1, %edx
; X64: pinsrw $0, %edx, %xmm1
; X64: pinsrw $1, %eax, %xmm0
; X64: ret
}