2004-08-10 20:42:36 +00:00
|
|
|
TODO:
|
2005-04-11 20:48:57 +00:00
|
|
|
* gpr0 allocation
|
2004-10-26 04:10:53 +00:00
|
|
|
* implement do-loop -> bdnz transform
|
2004-08-14 22:16:36 +00:00
|
|
|
* implement powerpc-64 for darwin
|
2004-08-29 22:02:43 +00:00
|
|
|
* use stfiwx in float->int
|
2005-12-24 01:00:15 +00:00
|
|
|
|
|
|
|
* Fold add and sub with constant into non-extern, non-weak addresses so this:
|
2005-07-26 18:59:06 +00:00
|
|
|
lis r2, ha16(l2__ZTV4Cell)
|
|
|
|
la r2, lo16(l2__ZTV4Cell)(r2)
|
|
|
|
addi r2, r2, 8
|
2005-12-24 01:00:15 +00:00
|
|
|
becomes:
|
|
|
|
lis r2, ha16(l2__ZTV4Cell+8)
|
|
|
|
la r2, lo16(l2__ZTV4Cell+8)(r2)
|
|
|
|
|
2005-07-26 19:07:51 +00:00
|
|
|
|
Make FP_TO_UINT Illegal. This allows us to generate significantly better
codegen for FP_TO_UINT by using the legalizer's SELECT variant.
Implement a codegen improvement for SELECT_CC, selecting the false node in
the MBB that feeds the phi node. This allows us to codegen:
void foo(int *a, int b, int c) { int d = (a < b) ? 5 : 9; *a = d; }
as:
_foo:
li r2, 5
cmpw cr0, r4, r3
bgt .LBB_foo_2 ; entry
.LBB_foo_1: ; entry
li r2, 9
.LBB_foo_2: ; entry
stw r2, 0(r3)
blr
instead of:
_foo:
li r2, 5
li r5, 9
cmpw cr0, r4, r3
bgt .LBB_foo_2 ; entry
.LBB_foo_1: ; entry
or r2, r5, r5
.LBB_foo_2: ; entry
stw r2, 0(r3)
blr
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@22784 91177308-0d34-0410-b5e6-96231b3b80d8
2005-08-14 01:17:16 +00:00
|
|
|
* Teach LLVM how to codegen this:
|
|
|
|
unsigned short foo(float a) { return a; }
|
|
|
|
as:
|
|
|
|
_foo:
|
|
|
|
fctiwz f0,f1
|
|
|
|
stfd f0,-8(r1)
|
|
|
|
lhz r3,-2(r1)
|
|
|
|
blr
|
|
|
|
not:
|
|
|
|
_foo:
|
|
|
|
fctiwz f0, f1
|
|
|
|
stfd f0, -8(r1)
|
|
|
|
lwz r2, -4(r1)
|
|
|
|
rlwinm r3, r2, 0, 16, 31
|
|
|
|
blr
|
|
|
|
|
2005-08-05 19:18:32 +00:00
|
|
|
* Support 'update' load/store instructions. These are cracked on the G5, but
|
|
|
|
are still a codesize win.
|
|
|
|
|
2004-07-27 18:43:04 +00:00
|
|
|
* should hint to the branch select pass that it doesn't need to print the
|
|
|
|
second unconditional branch, so we don't end up with things like:
|
|
|
|
b .LBBl42__2E_expand_function_8_674 ; loopentry.24
|
|
|
|
b .LBBl42__2E_expand_function_8_42 ; NewDefault
|
|
|
|
b .LBBl42__2E_expand_function_8_42 ; NewDefault
|
2005-08-23 06:27:59 +00:00
|
|
|
|
2005-08-24 18:15:24 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
2005-08-23 06:27:59 +00:00
|
|
|
* Codegen this:
|
|
|
|
|
|
|
|
void test2(int X) {
|
|
|
|
if (X == 0x12345678) bar();
|
|
|
|
}
|
|
|
|
|
|
|
|
as:
|
|
|
|
|
|
|
|
xoris r0,r3,0x1234
|
|
|
|
cmpwi cr0,r0,0x5678
|
|
|
|
beq cr0,L6
|
|
|
|
|
|
|
|
not:
|
|
|
|
|
|
|
|
lis r2, 4660
|
|
|
|
ori r2, r2, 22136
|
|
|
|
cmpw cr0, r3, r2
|
|
|
|
bne .LBB_test2_2
|
|
|
|
|
2005-08-24 18:15:24 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Lump the constant pool for each function into ONE pic object, and reference
|
|
|
|
pieces of it as offsets from the start. For functions like this (contrived
|
|
|
|
to have lots of constants obviously):
|
|
|
|
|
|
|
|
double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
|
|
|
|
|
|
|
|
We generate:
|
|
|
|
|
|
|
|
_X:
|
|
|
|
lis r2, ha16(.CPI_X_0)
|
|
|
|
lfd f0, lo16(.CPI_X_0)(r2)
|
|
|
|
lis r2, ha16(.CPI_X_1)
|
|
|
|
lfd f2, lo16(.CPI_X_1)(r2)
|
|
|
|
fmadd f0, f1, f0, f2
|
|
|
|
lis r2, ha16(.CPI_X_2)
|
|
|
|
lfd f1, lo16(.CPI_X_2)(r2)
|
|
|
|
lis r2, ha16(.CPI_X_3)
|
|
|
|
lfd f2, lo16(.CPI_X_3)(r2)
|
|
|
|
fmadd f1, f0, f1, f2
|
|
|
|
blr
|
|
|
|
|
|
|
|
It would be better to materialize .CPI_X into a register, then use immediates
|
|
|
|
off of the register to avoid the lis's. This is even more important in PIC
|
|
|
|
mode.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
2005-09-06 15:30:48 +00:00
|
|
|
|
|
|
|
Implement Newton-Raphson method for improving estimate instructions to the
|
|
|
|
correct accuracy, and implementing divide as multiply by reciprocal when it has
|
|
|
|
more than one use. Itanium will want this too.
|
2005-10-16 05:39:50 +00:00
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
int foo(int a, int b) { return a == b ? 16 : 0; }
|
|
|
|
_foo:
|
|
|
|
cmpw cr7, r3, r4
|
|
|
|
mfcr r2
|
|
|
|
rlwinm r2, r2, 31, 31, 31
|
|
|
|
slwi r3, r2, 4
|
|
|
|
blr
|
|
|
|
|
|
|
|
If we exposed the srl & mask ops after the MFCR that we are doing to select
|
|
|
|
the correct CR bit, then we could fold the slwi into the rlwinm before it.
|
2005-10-25 23:50:02 +00:00
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
#define ARRAY_LENGTH 16
|
|
|
|
|
|
|
|
union bitfield {
|
|
|
|
struct {
|
|
|
|
#ifndef __ppc__
|
|
|
|
unsigned int field0 : 6;
|
|
|
|
unsigned int field1 : 6;
|
|
|
|
unsigned int field2 : 6;
|
|
|
|
unsigned int field3 : 6;
|
|
|
|
unsigned int field4 : 3;
|
|
|
|
unsigned int field5 : 4;
|
|
|
|
unsigned int field6 : 1;
|
|
|
|
#else
|
|
|
|
unsigned int field6 : 1;
|
|
|
|
unsigned int field5 : 4;
|
|
|
|
unsigned int field4 : 3;
|
|
|
|
unsigned int field3 : 6;
|
|
|
|
unsigned int field2 : 6;
|
|
|
|
unsigned int field1 : 6;
|
|
|
|
unsigned int field0 : 6;
|
|
|
|
#endif
|
|
|
|
} bitfields, bits;
|
|
|
|
unsigned int u32All;
|
|
|
|
signed int i32All;
|
|
|
|
float f32All;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct program_t {
|
|
|
|
union bitfield array[ARRAY_LENGTH];
|
|
|
|
int size;
|
|
|
|
int loaded;
|
|
|
|
} program;
|
|
|
|
|
|
|
|
|
|
|
|
void AdjustBitfields(program* prog, unsigned int fmt1)
|
|
|
|
{
|
|
|
|
unsigned int shift = 0;
|
|
|
|
unsigned int texCount = 0;
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
for (i = 0; i < 8; i++)
|
|
|
|
{
|
|
|
|
prog->array[i].bitfields.field0 = texCount;
|
|
|
|
prog->array[i].bitfields.field1 = texCount + 1;
|
|
|
|
prog->array[i].bitfields.field2 = texCount + 2;
|
|
|
|
prog->array[i].bitfields.field3 = texCount + 3;
|
|
|
|
|
|
|
|
texCount += (fmt1 >> shift) & 0x7;
|
|
|
|
shift += 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
In the loop above, the bitfield adds get generated as
|
|
|
|
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
|
|
|
|
|
|
|
|
Since the input to the (or (and), (and)) is an (add) rather than a (shl), the shift
|
|
|
|
doesn't get folded into the rlwimi instruction. We should ideally see through
|
|
|
|
things like this, rather than forcing llvm to generate the equivalent
|
|
|
|
|
|
|
|
(shl (add bitfield, C2), C1) with some kind of mask.
|
2005-10-28 00:20:45 +00:00
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
2005-11-05 08:57:56 +00:00
|
|
|
Compile this:
|
|
|
|
|
|
|
|
int %f1(int %a, int %b) {
|
|
|
|
%tmp.1 = and int %a, 15 ; <int> [#uses=1]
|
|
|
|
%tmp.3 = and int %b, 240 ; <int> [#uses=1]
|
|
|
|
%tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
|
|
|
|
ret int %tmp.4
|
|
|
|
}
|
|
|
|
|
|
|
|
without a copy. We make this currently:
|
|
|
|
|
|
|
|
_f1:
|
|
|
|
rlwinm r2, r4, 0, 24, 27
|
|
|
|
rlwimi r2, r3, 0, 28, 31
|
|
|
|
or r3, r2, r2
|
|
|
|
blr
|
|
|
|
|
|
|
|
The two-addr pass or RA needs to learn when it is profitable to commute an
|
|
|
|
instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
|
|
|
|
currently only commutes to avoid inserting a copy BEFORE the two addr instr.
|
|
|
|
|
2005-12-08 07:13:28 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
2006-01-28 01:22:10 +00:00
|
|
|
176.gcc contains a bunch of code like this (this occurs dozens of times):
|
|
|
|
|
|
|
|
int %test(uint %mode.0.i.0) {
|
|
|
|
%tmp.79 = cast uint %mode.0.i.0 to sbyte ; <sbyte> [#uses=1]
|
|
|
|
%tmp.80 = cast sbyte %tmp.79 to int ; <int> [#uses=1]
|
|
|
|
%tmp.81 = shl int %tmp.80, ubyte 16 ; <int> [#uses=1]
|
|
|
|
%tmp.82 = and int %tmp.81, 16711680
|
|
|
|
ret int %tmp.82
|
|
|
|
}
|
|
|
|
|
|
|
|
which we compile to:
|
|
|
|
|
|
|
|
_test:
|
|
|
|
extsb r2, r3
|
|
|
|
rlwinm r3, r2, 16, 8, 15
|
|
|
|
blr
|
|
|
|
|
|
|
|
The extsb is obviously dead. This can be handled by a future thing like
|
|
|
|
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
|
|
|
|
the sign bits are never used, so we can fold the sext_inreg to nothing).
|
|
|
|
|
|
|
|
I'm seeing code like this:
|
|
|
|
|
|
|
|
srwi r3, r3, 16
|
|
|
|
extsb r3, r3
|
|
|
|
rlwimi r4, r3, 16, 8, 15
|
|
|
|
|
|
|
|
in which the extsb is preventing the srwi from being nuked.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Another example that occurs is:
|
|
|
|
|
|
|
|
uint %test(int %specbits.6.1) {
|
|
|
|
%tmp.2540 = shr int %specbits.6.1, ubyte 11 ; <int> [#uses=1]
|
|
|
|
%tmp.2541 = cast int %tmp.2540 to uint ; <uint> [#uses=1]
|
|
|
|
%tmp.2542 = shl uint %tmp.2541, ubyte 13 ; <uint> [#uses=1]
|
|
|
|
%tmp.2543 = and uint %tmp.2542, 8192 ; <uint> [#uses=1]
|
|
|
|
ret uint %tmp.2543
|
|
|
|
}
|
|
|
|
|
|
|
|
which we codegen as:
|
|
|
|
|
|
|
|
l1_test:
|
|
|
|
srawi r2, r3, 11
|
|
|
|
rlwinm r3, r2, 13, 18, 18
|
|
|
|
blr
|
|
|
|
|
|
|
|
the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
|
|
|
|
dead), which I think can then be folded into the rlwinm.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
2005-12-08 07:13:28 +00:00
|
|
|
Compile offsets from allocas:
|
|
|
|
|
|
|
|
int *%test() {
|
|
|
|
%X = alloca { int, int }
|
|
|
|
%Y = getelementptr {int,int}* %X, int 0, uint 1
|
|
|
|
ret int* %Y
|
|
|
|
}
|
|
|
|
|
|
|
|
into a single add, not two:
|
|
|
|
|
|
|
|
_test:
|
|
|
|
addi r2, r1, -8
|
|
|
|
addi r3, r2, 4
|
|
|
|
blr
|
|
|
|
|
|
|
|
--> important for C++.
|
|
|
|
|
2005-12-22 17:19:28 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
int test3(int a, int b) { return (a < 0) ? a : 0; }
|
|
|
|
|
|
|
|
should be branch free code. LLVM is turning it into < 1 because of the RHS.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
No loads or stores of the constants should be needed:
|
|
|
|
|
|
|
|
struct foo { double X, Y; };
|
|
|
|
void xxx(struct foo F);
|
|
|
|
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
|
|
|
|
|
2006-01-16 17:53:00 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
For this:
|
|
|
|
|
|
|
|
int h(int i, int j, int k) {
|
|
|
|
return (i==0||j==0||k == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
We currently emit this:
|
|
|
|
|
|
|
|
_h:
|
|
|
|
cntlzw r2, r3
|
|
|
|
cntlzw r3, r4
|
|
|
|
cntlzw r4, r5
|
|
|
|
srwi r2, r2, 5
|
|
|
|
srwi r3, r3, 5
|
|
|
|
srwi r4, r4, 5
|
|
|
|
or r2, r3, r2
|
|
|
|
or r3, r2, r4
|
|
|
|
blr
|
|
|
|
|
|
|
|
The ctlz/shift instructions are created by the isel, so the dag combiner doesn't
|
|
|
|
have a chance to pull the shifts through the or's (eliminating two
|
|
|
|
instructions). SETCC nodes should be custom lowered in this case, not expanded
|
|
|
|
by the isel.
|
|
|
|
|
2006-01-16 17:58:54 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Darwin Stub LICM optimization:
|
|
|
|
|
|
|
|
Loops like this:
|
|
|
|
|
|
|
|
for (...) bar();
|
|
|
|
|
|
|
|
Have to go through an indirect stub if bar is external or linkonce. It would
|
|
|
|
be better to compile it as:
|
|
|
|
|
|
|
|
fp = &bar;
|
|
|
|
for (...) fp();
|
|
|
|
|
|
|
|
which only computes the address of bar once (instead of each time through the
|
|
|
|
stub). This is Darwin specific and would have to be done in the code generator.
|
|
|
|
Probably not a win on x86.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
PowerPC i1/setcc stuff (depends on subreg stuff):
|
|
|
|
|
|
|
|
Check out the PPC code we get for 'compare' in this testcase:
|
|
|
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
|
|
|
|
|
|
|
|
oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
|
|
|
|
invert, invert, or), we then have to compare it against zero instead of
|
|
|
|
using the value already in a CR!
|
|
|
|
|
|
|
|
that should be something like
|
|
|
|
cmpw cr7, r8, r5
|
|
|
|
cmpw cr0, r7, r3
|
|
|
|
crnand cr0, cr0, cr7
|
|
|
|
bne cr0, LBB_compare_4
|
|
|
|
|
|
|
|
instead of
|
|
|
|
cmpw cr7, r8, r5
|
|
|
|
cmpw cr0, r7, r3
|
|
|
|
mfcr r7, 1
|
|
|
|
mcrf cr7, cr0
|
|
|
|
mfcr r8, 1
|
|
|
|
rlwinm r7, r7, 30, 31, 31
|
|
|
|
rlwinm r8, r8, 30, 31, 31
|
|
|
|
xori r7, r7, 1
|
|
|
|
xori r8, r8, 1
|
|
|
|
addi r2, r2, 1
|
|
|
|
or r7, r8, r7
|
|
|
|
cmpwi cr0, r7, 0
|
|
|
|
bne cr0, LBB_compare_4 ; loopexit
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Simple IPO for argument passing, change:
|
|
|
|
void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
|
|
|
|
|
|
|
|
the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
|
|
|
|
of arguments get assigned to r3 through r10. That is, if you have a function
|
|
|
|
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
|
|
|
|
argument bytes for r4 and r5. The trick then would be to shuffle the argument
|
|
|
|
order for functions we can internalize so that the maximum number of
|
|
|
|
integers/pointers get passed in regs before you see any of the fp arguments.
|
|
|
|
|
|
|
|
Instead of implementing this, it would actually probably be easier to just
|
|
|
|
implement a PPC fastcc, where we could do whatever we wanted to the CC,
|
|
|
|
including having this work sanely.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Fix Darwin FP-In-Integer Registers ABI
|
|
|
|
|
|
|
|
Darwin passes doubles in structures in integer registers, which is very very
|
|
|
|
bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
|
|
|
|
that percolates these things out of functions.
|
|
|
|
|
|
|
|
Check out how horrible this is:
|
|
|
|
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
|
|
|
|
|
|
|
|
This is an extension of "interprocedural CC unmunging" that can't be done with
|
|
|
|
just fastcc.
|
|
|
|
|
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Code Gen IPO optimization:
|
|
|
|
|
|
|
|
Squish small scalar globals together into a single global struct, allowing the
|
|
|
|
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
|
|
|
|
of the GOT on targets with one).
|
|
|
|
|
2006-01-19 02:09:38 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Generate lwbrx and other byteswapping load/store instructions when reasonable.
|
|
|
|
|
2006-01-28 05:40:47 +00:00
|
|
|
===-------------------------------------------------------------------------===
|
|
|
|
|
|
|
|
Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
|
|
|
|
TargetConstantVec's if it's one of the many forms that are algorithmically
|
|
|
|
computable using the spiffy altivec instructions.
|
|
|
|
|