mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-06 04:31:08 +00:00
Remove some already-fixed README entries.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@105377 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a04a2c0a50
commit
db1bf34178
@ -66,12 +66,6 @@ LBB_X_2:
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
It's not clear whether we should use pxor or xorps / xorpd to clear XMM
|
|
||||||
registers. The choice may depend on subtarget information. We should do some
|
|
||||||
more experiments on different x86 machines.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
|
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
|
||||||
feasible.
|
feasible.
|
||||||
|
|
||||||
@ -95,35 +89,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
How to decide when to use the "floating point version" of logical ops? Here are
|
|
||||||
some code fragments:
|
|
||||||
|
|
||||||
movaps LCPI5_5, %xmm2
|
|
||||||
divps %xmm1, %xmm2
|
|
||||||
mulps %xmm2, %xmm3
|
|
||||||
mulps 8656(%ecx), %xmm3
|
|
||||||
addps 8672(%ecx), %xmm3
|
|
||||||
andps LCPI5_6, %xmm2
|
|
||||||
andps LCPI5_1, %xmm3
|
|
||||||
por %xmm2, %xmm3
|
|
||||||
movdqa %xmm3, (%edi)
|
|
||||||
|
|
||||||
movaps LCPI5_5, %xmm1
|
|
||||||
divps %xmm0, %xmm1
|
|
||||||
mulps %xmm1, %xmm3
|
|
||||||
mulps 8656(%ecx), %xmm3
|
|
||||||
addps 8672(%ecx), %xmm3
|
|
||||||
andps LCPI5_6, %xmm1
|
|
||||||
andps LCPI5_1, %xmm3
|
|
||||||
orps %xmm1, %xmm3
|
|
||||||
movaps %xmm3, 112(%esp)
|
|
||||||
movaps %xmm3, (%ebx)
|
|
||||||
|
|
||||||
Due to some minor source change, the latter case ended up using orps and movaps
|
|
||||||
instead of por and movdqa. Does it matter?
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
|
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
|
||||||
to choose between movaps, movapd, and movdqa based on types of source and
|
to choose between movaps, movapd, and movdqa based on types of source and
|
||||||
destination?
|
destination?
|
||||||
@ -222,41 +187,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
|
|
||||||
|
|
||||||
LLVM is producing bad code.
|
|
||||||
|
|
||||||
LBB_main_4: # cond_true44
|
|
||||||
addps %xmm1, %xmm2
|
|
||||||
subps %xmm3, %xmm2
|
|
||||||
movaps (%ecx), %xmm4
|
|
||||||
movaps %xmm2, %xmm1
|
|
||||||
addps %xmm4, %xmm1
|
|
||||||
addl $16, %ecx
|
|
||||||
incl %edx
|
|
||||||
cmpl $262144, %edx
|
|
||||||
movaps %xmm3, %xmm2
|
|
||||||
movaps %xmm4, %xmm3
|
|
||||||
jne LBB_main_4 # cond_true44
|
|
||||||
|
|
||||||
There are two problems. 1) No need for two loop induction variables. We can
|
|
||||||
compare against 262144 * 16. 2) Known register coalescer issue. We should
|
|
||||||
be able to eliminate one of the movaps:
|
|
||||||
|
|
||||||
addps %xmm2, %xmm1 <=== Commute!
|
|
||||||
subps %xmm3, %xmm1
|
|
||||||
movaps (%ecx), %xmm4
|
|
||||||
movaps %xmm1, %xmm1 <=== Eliminate!
|
|
||||||
addps %xmm4, %xmm1
|
|
||||||
addl $16, %ecx
|
|
||||||
incl %edx
|
|
||||||
cmpl $262144, %edx
|
|
||||||
movaps %xmm3, %xmm2
|
|
||||||
movaps %xmm4, %xmm3
|
|
||||||
jne LBB_main_4 # cond_true44
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Consider:
|
Consider:
|
||||||
|
|
||||||
__m128 test(float a) {
|
__m128 test(float a) {
|
||||||
@ -326,22 +256,6 @@ elements are fixed zeros.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
__m128d test1( __m128d A, __m128d B) {
|
|
||||||
return _mm_shuffle_pd(A, B, 0x3);
|
|
||||||
}
|
|
||||||
|
|
||||||
compiles to
|
|
||||||
|
|
||||||
shufpd $3, %xmm1, %xmm0
|
|
||||||
|
|
||||||
Perhaps it's better to use unpckhpd instead?
|
|
||||||
|
|
||||||
unpckhpd %xmm1, %xmm0
|
|
||||||
|
|
||||||
Don't know if unpckhpd is faster. But it is shorter.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
This code generates ugly code, probably due to costs being off or something:
|
This code generates ugly code, probably due to costs being off or something:
|
||||||
|
|
||||||
define void @test(float* %P, <4 x float>* %P2 ) {
|
define void @test(float* %P, <4 x float>* %P2 ) {
|
||||||
@ -493,6 +407,7 @@ entry:
|
|||||||
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
|
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
|
||||||
ret i64 %tmp20
|
ret i64 %tmp20
|
||||||
}
|
}
|
||||||
|
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
|
||||||
|
|
||||||
This currently compiles to:
|
This currently compiles to:
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user