mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-08-07 12:28:24 +00:00
Add some observations from CoreGraphics benchmark. Remove register
scavenging todo item, since it is now implemented. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@35044 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
@@ -140,6 +140,29 @@ odd/even pair. However, we probably would pay a penalty if the address is not
|
|||||||
aligned on 8-byte boundary. This requires more information on load / store
|
aligned on 8-byte boundary. This requires more information on load / store
|
||||||
nodes (and MI's?) than we currently carry.
|
nodes (and MI's?) than we currently carry.
|
||||||
|
|
||||||
|
6) (From CoreGraphics): struct copies appear to be done field by field
|
||||||
|
instead of by words, at least sometimes:
|
||||||
|
|
||||||
|
struct foo { int x; short s; char c1; char c2; };
|
||||||
|
void cpy(struct foo*a, struct foo*b) { *a = *b; }
|
||||||
|
|
||||||
|
llvm code (-O2)
|
||||||
|
ldrb r3, [r1, #+6]
|
||||||
|
ldr r2, [r1]
|
||||||
|
ldrb r12, [r1, #+7]
|
||||||
|
ldrh r1, [r1, #+4]
|
||||||
|
str r2, [r0]
|
||||||
|
strh r1, [r0, #+4]
|
||||||
|
strb r3, [r0, #+6]
|
||||||
|
strb r12, [r0, #+7]
|
||||||
|
gcc code (-O2)
|
||||||
|
ldmia r1, {r1-r2}
|
||||||
|
stmia r0, {r1-r2}
|
||||||
|
|
||||||
|
In this benchmark poor handling of aggregate copies has shown up as
|
||||||
|
having a large effect on size, and possibly speed as well (we don't have
|
||||||
|
a good way to measure on ARM).
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
* Consider this silly example:
|
* Consider this silly example:
|
||||||
@@ -282,53 +305,8 @@ See McCat/18-imp/ComputeBoundingBoxes for an example.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
We need register scavenging. Currently, the 'ip' register is reserved in case
|
Register scavenging is now implemented. The example in the previous version
|
||||||
frame indexes are too big. This means that we generate extra code for stuff
|
of this document produces optimal code at -O2.
|
||||||
like this:
|
|
||||||
|
|
||||||
void foo(unsigned x, unsigned y, unsigned z, unsigned *a, unsigned *b, unsigned *c) {
|
|
||||||
short Rconst = (short) (16384.0f * 1.40200 + 0.5 );
|
|
||||||
*a = x * Rconst;
|
|
||||||
*b = y * Rconst;
|
|
||||||
*c = z * Rconst;
|
|
||||||
}
|
|
||||||
|
|
||||||
we compile it to:
|
|
||||||
|
|
||||||
_foo:
|
|
||||||
*** stmfd sp!, {r4, r7}
|
|
||||||
*** add r7, sp, #4
|
|
||||||
mov r4, #186
|
|
||||||
orr r4, r4, #89, 24 @ 22784
|
|
||||||
mul r0, r0, r4
|
|
||||||
str r0, [r3]
|
|
||||||
mul r0, r1, r4
|
|
||||||
ldr r1, [sp, #+8]
|
|
||||||
str r0, [r1]
|
|
||||||
mul r0, r2, r4
|
|
||||||
ldr r1, [sp, #+12]
|
|
||||||
str r0, [r1]
|
|
||||||
*** sub sp, r7, #4
|
|
||||||
*** ldmfd sp!, {r4, r7}
|
|
||||||
bx lr
|
|
||||||
|
|
||||||
GCC produces:
|
|
||||||
|
|
||||||
_foo:
|
|
||||||
ldr ip, L4
|
|
||||||
mul r0, ip, r0
|
|
||||||
mul r1, ip, r1
|
|
||||||
str r0, [r3, #0]
|
|
||||||
ldr r3, [sp, #0]
|
|
||||||
mul r2, ip, r2
|
|
||||||
str r1, [r3, #0]
|
|
||||||
ldr r3, [sp, #4]
|
|
||||||
str r2, [r3, #0]
|
|
||||||
bx lr
|
|
||||||
L4:
|
|
||||||
.long 22970
|
|
||||||
|
|
||||||
This is apparently all because we couldn't use ip here.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
@@ -449,3 +427,25 @@ http://www.inf.u-szeged.hu/gcc-arm/
|
|||||||
http://citeseer.ist.psu.edu/debus04linktime.html
|
http://citeseer.ist.psu.edu/debus04linktime.html
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
(CoreGraphics): gcc generates smaller code for this function at -O2 or -Os:
|
||||||
|
|
||||||
|
void foo(signed char* p) {
|
||||||
|
if (*p == 3)
|
||||||
|
bar();
|
||||||
|
else if (*p == 4)
|
||||||
|
baz();
|
||||||
|
else if (*p == 5)
|
||||||
|
quux();
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm decides it's a good idea to turn the repeated if...else into a
|
||||||
|
binary tree, as if it were a switch; the resulting code requires -1
|
||||||
|
compare-and-branches when *p<=2 or *p==5, the same number if *p==4
|
||||||
|
or *p>6, and +1 if *p==3. So it should be a speed win
|
||||||
|
(on balance). However, the revised code is larger, with 4 conditional
|
||||||
|
branches instead of 3.
|
||||||
|
|
||||||
|
More seriously, there is a byte->word extend before
|
||||||
|
each comparison, where there should be only one, and the condition codes
|
||||||
|
are not remembered when the same two values are compared twice.
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user