mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-30 02:32:08 +00:00
3cda14ff3e
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@25439 91177308-0d34-0410-b5e6-96231b3b80d8
346 lines
9.9 KiB
Plaintext
346 lines
9.9 KiB
Plaintext
TODO:
|
|
* gpr0 allocation
|
|
* implement do-loop -> bdnz transform
|
|
* implement powerpc-64 for darwin
|
|
* use stfiwx in float->int
|
|
|
|
* Fold add and sub with constant into non-extern, non-weak addresses so this:
|
|
lis r2, ha16(l2__ZTV4Cell)
|
|
la r2, lo16(l2__ZTV4Cell)(r2)
|
|
addi r2, r2, 8
|
|
becomes:
|
|
lis r2, ha16(l2__ZTV4Cell+8)
|
|
la r2, lo16(l2__ZTV4Cell+8)(r2)
|
|
|
|
|
|
* Teach LLVM how to codegen this:
|
|
unsigned short foo(float a) { return a; }
|
|
as:
|
|
_foo:
|
|
fctiwz f0,f1
|
|
stfd f0,-8(r1)
|
|
lhz r3,-2(r1)
|
|
blr
|
|
not:
|
|
_foo:
|
|
fctiwz f0, f1
|
|
stfd f0, -8(r1)
|
|
lwz r2, -4(r1)
|
|
rlwinm r3, r2, 0, 16, 31
|
|
blr
|
|
|
|
* Support 'update' load/store instructions. These are cracked on the G5, but
|
|
are still a codesize win.
|
|
|
|
* should hint to the branch select pass that it doesn't need to print the
|
|
second unconditional branch, so we don't end up with things like:
|
|
b .LBBl42__2E_expand_function_8_674 ; loopentry.24
|
|
b .LBBl42__2E_expand_function_8_42 ; NewDefault
|
|
b .LBBl42__2E_expand_function_8_42 ; NewDefault
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
* Codegen this:
|
|
|
|
void test2(int X) {
|
|
if (X == 0x12345678) bar();
|
|
}
|
|
|
|
as:
|
|
|
|
xoris r0,r3,0x1234
|
|
cmpwi cr0,r0,0x5678
|
|
beq cr0,L6
|
|
|
|
not:
|
|
|
|
lis r2, 4660
|
|
ori r2, r2, 22136
|
|
cmpw cr0, r3, r2
|
|
bne .LBB_test2_2
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Lump the constant pool for each function into ONE pic object, and reference
|
|
pieces of it as offsets from the start. For functions like this (contrived
|
|
to have lots of constants obviously):
|
|
|
|
double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
|
|
|
|
We generate:
|
|
|
|
_X:
|
|
lis r2, ha16(.CPI_X_0)
|
|
lfd f0, lo16(.CPI_X_0)(r2)
|
|
lis r2, ha16(.CPI_X_1)
|
|
lfd f2, lo16(.CPI_X_1)(r2)
|
|
fmadd f0, f1, f0, f2
|
|
lis r2, ha16(.CPI_X_2)
|
|
lfd f1, lo16(.CPI_X_2)(r2)
|
|
lis r2, ha16(.CPI_X_3)
|
|
lfd f2, lo16(.CPI_X_3)(r2)
|
|
fmadd f1, f0, f1, f2
|
|
blr
|
|
|
|
It would be better to materialize .CPI_X into a register, then use immediates
|
|
off of the register to avoid the lis's. This is even more important in PIC
|
|
mode.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Implement the Newton-Raphson method for improving estimate instructions to the
|
|
correct accuracy, and implementing divide as multiply by reciprocal when it has
|
|
more than one use. Itanium will want this too.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
int foo(int a, int b) { return a == b ? 16 : 0; }
|
|
_foo:
|
|
cmpw cr7, r3, r4
|
|
mfcr r2
|
|
rlwinm r2, r2, 31, 31, 31
|
|
slwi r3, r2, 4
|
|
blr
|
|
|
|
If we exposed the srl & mask ops after the MFCR that we are doing to select
|
|
the correct CR bit, then we could fold the slwi into the rlwinm before it.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
#define ARRAY_LENGTH 16
|
|
|
|
union bitfield {
|
|
struct {
|
|
#ifndef __ppc__
|
|
unsigned int field0 : 6;
|
|
unsigned int field1 : 6;
|
|
unsigned int field2 : 6;
|
|
unsigned int field3 : 6;
|
|
unsigned int field4 : 3;
|
|
unsigned int field5 : 4;
|
|
unsigned int field6 : 1;
|
|
#else
|
|
unsigned int field6 : 1;
|
|
unsigned int field5 : 4;
|
|
unsigned int field4 : 3;
|
|
unsigned int field3 : 6;
|
|
unsigned int field2 : 6;
|
|
unsigned int field1 : 6;
|
|
unsigned int field0 : 6;
|
|
#endif
|
|
} bitfields, bits;
|
|
unsigned int u32All;
|
|
signed int i32All;
|
|
float f32All;
|
|
};
|
|
|
|
|
|
typedef struct program_t {
|
|
union bitfield array[ARRAY_LENGTH];
|
|
int size;
|
|
int loaded;
|
|
} program;
|
|
|
|
|
|
void AdjustBitfields(program* prog, unsigned int fmt1)
|
|
{
|
|
unsigned int shift = 0;
|
|
unsigned int texCount = 0;
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < 8; i++)
|
|
{
|
|
prog->array[i].bitfields.field0 = texCount;
|
|
prog->array[i].bitfields.field1 = texCount + 1;
|
|
prog->array[i].bitfields.field2 = texCount + 2;
|
|
prog->array[i].bitfields.field3 = texCount + 3;
|
|
|
|
texCount += (fmt1 >> shift) & 0x7;
|
|
shift += 3;
|
|
}
|
|
}
|
|
|
|
In the loop above, the bitfield adds get generated as
|
|
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
|
|
|
|
Since the input to the (or and, and) is an (add) rather than a (shl), the shift
|
|
doesn't get folded into the rlwimi instruction. We should ideally see through
|
|
things like this, rather than forcing llvm to generate the equivalent
|
|
|
|
(shl (add bitfield, C2), C1) with some kind of mask.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Compile this:
|
|
|
|
int %f1(int %a, int %b) {
|
|
%tmp.1 = and int %a, 15 ; <int> [#uses=1]
|
|
%tmp.3 = and int %b, 240 ; <int> [#uses=1]
|
|
%tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
|
|
ret int %tmp.4
|
|
}
|
|
|
|
without a copy. We make this currently:
|
|
|
|
_f1:
|
|
rlwinm r2, r4, 0, 24, 27
|
|
rlwimi r2, r3, 0, 28, 31
|
|
or r3, r2, r2
|
|
blr
|
|
|
|
The two-addr pass or RA needs to learn when it is profitable to commute an
|
|
instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
|
|
currently only commutes to avoid inserting a copy BEFORE the two addr instr.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Compile offsets from allocas:
|
|
|
|
int *%test() {
|
|
%X = alloca { int, int }
|
|
%Y = getelementptr {int,int}* %X, int 0, uint 1
|
|
ret int* %Y
|
|
}
|
|
|
|
into a single add, not two:
|
|
|
|
_test:
|
|
addi r2, r1, -8
|
|
addi r3, r2, 4
|
|
blr
|
|
|
|
--> important for C++.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
int test3(int a, int b) { return (a < 0) ? a : 0; }
|
|
|
|
should be branch-free code. LLVM is turning the comparison into (a < 1) because of the RHS.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
No loads or stores of the constants should be needed:
|
|
|
|
struct foo { double X, Y; };
|
|
void xxx(struct foo F);
|
|
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
For this:
|
|
|
|
int h(int i, int j, int k) {
|
|
return (i==0||j==0||k == 0);
|
|
}
|
|
|
|
We currently emit this:
|
|
|
|
_h:
|
|
cntlzw r2, r3
|
|
cntlzw r3, r4
|
|
cntlzw r4, r5
|
|
srwi r2, r2, 5
|
|
srwi r3, r3, 5
|
|
srwi r4, r4, 5
|
|
or r2, r3, r2
|
|
or r3, r2, r4
|
|
blr
|
|
|
|
The ctlz/shift instructions are created by the isel, so the dag combiner doesn't
|
|
have a chance to pull the shifts through the or's (eliminating two
|
|
instructions). SETCC nodes should be custom lowered in this case, not expanded
|
|
by the isel.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Darwin Stub LICM optimization:
|
|
|
|
Loops like this:
|
|
|
|
for (...) bar();
|
|
|
|
Have to go through an indirect stub if bar is external or linkonce. It would
|
|
be better to compile it as:
|
|
|
|
fp = &bar;
|
|
for (...) fp();
|
|
|
|
which only computes the address of bar once (instead of each time through the
|
|
stub). This is Darwin specific and would have to be done in the code generator.
|
|
Probably not a win on x86.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
PowerPC i1/setcc stuff (depends on subreg stuff):
|
|
|
|
Check out the PPC code we get for 'compare' in this testcase:
|
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
|
|
|
|
Oof. On top of not doing the logical crnand instead of (mfcr, mfcr,
|
|
invert, invert, or), we then have to compare it against zero instead of
|
|
using the value already in a CR!
|
|
|
|
that should be something like
|
|
cmpw cr7, r8, r5
|
|
cmpw cr0, r7, r3
|
|
crnand cr0, cr0, cr7
|
|
bne cr0, LBB_compare_4
|
|
|
|
instead of
|
|
cmpw cr7, r8, r5
|
|
cmpw cr0, r7, r3
|
|
mfcr r7, 1
|
|
mcrf cr7, cr0
|
|
mfcr r8, 1
|
|
rlwinm r7, r7, 30, 31, 31
|
|
rlwinm r8, r8, 30, 31, 31
|
|
xori r7, r7, 1
|
|
xori r8, r8, 1
|
|
addi r2, r2, 1
|
|
or r7, r8, r7
|
|
cmpwi cr0, r7, 0
|
|
bne cr0, LBB_compare_4 ; loopexit
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Simple IPO for argument passing, change:
|
|
void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
|
|
|
|
The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
|
|
of arguments get assigned to r3 through r10. That is, if you have a function
|
|
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
|
|
argument bytes for r4 and r5. The trick then would be to shuffle the argument
|
|
order for functions we can internalize so that the maximum number of
|
|
integers/pointers get passed in regs before you see any of the fp arguments.
|
|
|
|
Instead of implementing this, it would actually probably be easier to just
|
|
implement a PPC fastcc, where we could do whatever we wanted to the CC,
|
|
including having this work sanely.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Fix Darwin FP-In-Integer Registers ABI
|
|
|
|
Darwin passes doubles in structures in integer registers, which is very very
|
|
bad. Add something like a BIT_CONVERT to LLVM, then do an interprocedural transformation
|
|
that percolates these things out of functions.
|
|
|
|
Check out how horrible this is:
|
|
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
|
|
|
|
This is an extension of "interprocedural CC unmunging" that can't be done with
|
|
just fastcc.
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Code Gen IPO optimization:
|
|
|
|
Squish small scalar globals together into a single global struct, allowing the
|
|
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
|
|
of the GOT on targets with one).
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
Generate lwbrx and other byteswapping load/store instructions when reasonable.
|
|
|