mirror of
https://github.com/jeremysrand/llvm-65816.git
synced 2024-11-05 00:06:07 +00:00
734 lines
22 KiB
Plaintext
734 lines
22 KiB
Plaintext
|
//===---------------------------------------------------------------------===//
|
||
|
// Random ideas for the ARM backend.
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Reimplement 'select' in terms of 'SEL'.
|
||
|
|
||
|
* We would really like to support UXTAB16, but we need to prove that the
|
||
|
add doesn't need to overflow between the two 16-bit chunks.
|
||
|
|
||
|
* Implement pre/post increment support. (e.g. PR935)
|
||
|
* Implement smarter constant generation for binops with large immediates.
|
||
|
|
||
|
A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX
|
||
|
|
||
|
Interesting optimization for PIC codegen on arm-linux:
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the
|
||
|
time regalloc happens, these values are now in a 32-bit register, usually with
|
||
|
the top-bits known to be sign or zero extended. If spilled, we should be able
|
||
|
to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as part
|
||
|
of the reload.
|
||
|
|
||
|
Doing this reduces the size of the stack frame (important for thumb etc), and
|
||
|
also increases the likelihood that we will be able to reload multiple values
|
||
|
from the stack with a single load.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
The constant island pass is in good shape. Some cleanups might be desirable,
|
||
|
but there is unlikely to be much improvement in the generated code.
|
||
|
|
||
|
1. There may be some advantage to trying to be smarter about the initial
|
||
|
placement, rather than putting everything at the end.
|
||
|
|
||
|
2. There might be some compile-time efficiency to be had by representing
|
||
|
consecutive islands as a single block rather than multiple blocks.
|
||
|
|
||
|
3. Use a priority queue to sort constant pool users in inverse order of
|
||
|
position so we always process the one closest to the end of functions
|
||
|
first. This may simplify CreateNewWater.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Eliminate copysign custom expansion. We are still generating crappy code with
|
||
|
default expansion + if-conversion.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Eliminate one instruction from:
|
||
|
|
||
|
define i32 @_Z6slow4bii(i32 %x, i32 %y) {
|
||
|
%tmp = icmp sgt i32 %x, %y
|
||
|
%retval = select i1 %tmp, i32 %x, i32 %y
|
||
|
ret i32 %retval
|
||
|
}
|
||
|
|
||
|
__Z6slow4bii:
|
||
|
cmp r0, r1
|
||
|
movgt r1, r0
|
||
|
mov r0, r1
|
||
|
bx lr
|
||
|
=>
|
||
|
|
||
|
__Z6slow4bii:
|
||
|
cmp r0, r1
|
||
|
movle r0, r1
|
||
|
bx lr
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Implement long long "X-3" with instructions that fold the immediate in. These
|
||
|
were disabled due to badness with the ARM carry flag on subtracts.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
More load / store optimizations:
|
||
|
1) Better representation for block transfer? This is from Olden/power:
|
||
|
|
||
|
fldd d0, [r4]
|
||
|
fstd d0, [r4, #+32]
|
||
|
fldd d0, [r4, #+8]
|
||
|
fstd d0, [r4, #+40]
|
||
|
fldd d0, [r4, #+16]
|
||
|
fstd d0, [r4, #+48]
|
||
|
fldd d0, [r4, #+24]
|
||
|
fstd d0, [r4, #+56]
|
||
|
|
||
|
If we can spare the registers, it would be better to use fldm and fstm here.
|
||
|
Need major register allocator enhancement though.
|
||
|
|
||
|
2) Can we recognize the relative position of constantpool entries? i.e. Treat
|
||
|
|
||
|
ldr r0, LCPI17_3
|
||
|
ldr r1, LCPI17_4
|
||
|
ldr r2, LCPI17_5
|
||
|
|
||
|
as
|
||
|
ldr r0, LCPI17
|
||
|
ldr r1, LCPI17+4
|
||
|
ldr r2, LCPI17+8
|
||
|
|
||
|
Then the ldr's can be combined into a single ldm. See Olden/power.
|
||
|
|
||
|
Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
|
||
|
double 64-bit FP constant:
|
||
|
|
||
|
adr r0, L6
|
||
|
ldmia r0, {r0-r1}
|
||
|
|
||
|
.align 2
|
||
|
L6:
|
||
|
.long -858993459
|
||
|
.long 1074318540
|
||
|
|
||
|
3) struct copies appear to be done field by field
|
||
|
instead of by words, at least sometimes:
|
||
|
|
||
|
struct foo { int x; short s; char c1; char c2; };
|
||
|
void cpy(struct foo*a, struct foo*b) { *a = *b; }
|
||
|
|
||
|
llvm code (-O2)
|
||
|
ldrb r3, [r1, #+6]
|
||
|
ldr r2, [r1]
|
||
|
ldrb r12, [r1, #+7]
|
||
|
ldrh r1, [r1, #+4]
|
||
|
str r2, [r0]
|
||
|
strh r1, [r0, #+4]
|
||
|
strb r3, [r0, #+6]
|
||
|
strb r12, [r0, #+7]
|
||
|
gcc code (-O2)
|
||
|
ldmia r1, {r1-r2}
|
||
|
stmia r0, {r1-r2}
|
||
|
|
||
|
In this benchmark poor handling of aggregate copies has shown up as
|
||
|
having a large effect on size, and possibly speed as well (we don't have
|
||
|
a good way to measure on ARM).
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
* Consider this silly example:
|
||
|
|
||
|
double bar(double x) {
|
||
|
double r = foo(3.1);
|
||
|
return x+r;
|
||
|
}
|
||
|
|
||
|
_bar:
|
||
|
stmfd sp!, {r4, r5, r7, lr}
|
||
|
add r7, sp, #8
|
||
|
mov r4, r0
|
||
|
mov r5, r1
|
||
|
fldd d0, LCPI1_0
|
||
|
fmrrd r0, r1, d0
|
||
|
bl _foo
|
||
|
fmdrr d0, r4, r5
|
||
|
fmsr s2, r0
|
||
|
fsitod d1, s2
|
||
|
faddd d0, d1, d0
|
||
|
fmrrd r0, r1, d0
|
||
|
ldmfd sp!, {r4, r5, r7, pc}
|
||
|
|
||
|
Ignore the prologue and epilogue stuff for a second. Note
|
||
|
mov r4, r0
|
||
|
mov r5, r1
|
||
|
the copies to callee-save registers and the fact they are only being used by the
|
||
|
fmdrr instruction. It would have been better had the fmdrr been scheduled
|
||
|
before the call and placed the result in a callee-save DPR register. The two
|
||
|
mov ops would not have been necessary.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Calling convention related stuff:
|
||
|
|
||
|
* gcc's parameter passing implementation is terrible and we suffer as a result:
|
||
|
|
||
|
e.g.
|
||
|
struct s {
|
||
|
double d1;
|
||
|
int s1;
|
||
|
};
|
||
|
|
||
|
void foo(struct s S) {
|
||
|
printf("%g, %d\n", S.d1, S.s1);
|
||
|
}
|
||
|
|
||
|
'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
|
||
|
then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
|
||
|
address of the format string):
|
||
|
|
||
|
stmfd sp!, {r7, lr}
|
||
|
add r7, sp, #0
|
||
|
sub sp, sp, #12
|
||
|
stmia sp, {r0, r1, r2}
|
||
|
ldmia sp, {r1-r2}
|
||
|
ldr r0, L5
|
||
|
ldr r3, [sp, #8]
|
||
|
L2:
|
||
|
add r0, pc, r0
|
||
|
bl L_printf$stub
|
||
|
|
||
|
Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
|
||
|
|
||
|
* Returning an aggregate type is even worse:
|
||
|
|
||
|
e.g.
|
||
|
struct s foo(void) {
|
||
|
struct s S = {1.1, 2};
|
||
|
return S;
|
||
|
}
|
||
|
|
||
|
mov ip, r0
|
||
|
ldr r0, L5
|
||
|
sub sp, sp, #12
|
||
|
L2:
|
||
|
add r0, pc, r0
|
||
|
@ lr needed for prologue
|
||
|
ldmia r0, {r0, r1, r2}
|
||
|
stmia sp, {r0, r1, r2}
|
||
|
stmia ip, {r0, r1, r2}
|
||
|
mov r0, ip
|
||
|
add sp, sp, #12
|
||
|
bx lr
|
||
|
|
||
|
r0 (and later ip) is the hidden parameter from caller to store the value in. The
|
||
|
first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1,
|
||
|
r2 into the address passed in. However, there is one additional stmia that
|
||
|
stores r0, r1, and r2 to some stack location. The store is dead.
|
||
|
|
||
|
The llvm-gcc generated code looks like this:
|
||
|
|
||
|
csretcc void %foo(%struct.s* %agg.result) {
|
||
|
entry:
|
||
|
%S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1]
|
||
|
%memtmp = alloca %struct.s ; <%struct.s*> [#uses=1]
|
||
|
cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2]
|
||
|
call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
|
||
|
cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2]
|
||
|
call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
|
||
|
cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1]
|
||
|
call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
|
||
|
constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
|
||
|
into a number of load and stores, or 2) custom lower memcpy (of small size) to
|
||
|
be ldmia / stmia. I think option 2 is better but the current register
|
||
|
allocator cannot allocate a chunk of registers at a time.
|
||
|
|
||
|
A feasible temporary solution is to use specific physical registers at the
|
||
|
lowering time for small (<= 4 words?) transfer size.
|
||
|
|
||
|
* ARM CSRet calling convention requires the hidden argument to be returned by
|
||
|
the callee.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
We can definitely do a better job on BB placements to eliminate some branches.
|
||
|
It's very common to see llvm generated assembly code that looks like this:
|
||
|
|
||
|
LBB3:
|
||
|
...
|
||
|
LBB4:
|
||
|
...
|
||
|
beq LBB3
|
||
|
b LBB2
|
||
|
|
||
|
If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
|
||
|
then eliminate beq and turn the unconditional branch to LBB2 into a bne.
|
||
|
|
||
|
See McCat/18-imp/ComputeBoundingBoxes for an example.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Pre-/post- indexed load / stores:
|
||
|
|
||
|
1) We should not make the pre/post- indexed load/store transform if the base ptr
|
||
|
is guaranteed to be live beyond the load/store. This can happen if the base
|
||
|
ptr is live out of the block in which we are performing the optimization. e.g.
|
||
|
|
||
|
mov r1, r2
|
||
|
ldr r3, [r1], #4
|
||
|
...
|
||
|
|
||
|
vs.
|
||
|
|
||
|
ldr r3, [r2]
|
||
|
add r1, r2, #4
|
||
|
...
|
||
|
|
||
|
In most cases, this is just a wasted optimization. However, sometimes it can
|
||
|
negatively impact the performance because two-address code is more restrictive
|
||
|
when it comes to scheduling.
|
||
|
|
||
|
Unfortunately, liveout information is currently unavailable during DAG combine
|
||
|
time.
|
||
|
|
||
|
2) Consider splitting an indexed load / store into a pair of add/sub + load/store
|
||
|
to solve #1 (in TwoAddressInstructionPass.cpp).
|
||
|
|
||
|
3) Enhance LSR to generate more opportunities for indexed ops.
|
||
|
|
||
|
4) Once we have added support for multiple result patterns, write indexed loads
|
||
|
patterns instead of C++ instruction selection code.
|
||
|
|
||
|
5) Use VLDM / VSTM to emulate indexed FP load / store.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Implement support for some more tricky ways to materialize immediates. For
|
||
|
example, to get 0xffff8000, we can use:
|
||
|
|
||
|
mov r9, #&3f8000
|
||
|
sub r9, r9, #&400000
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
We sometimes generate multiple add / sub instructions to update sp in prologue
|
||
|
and epilogue if the inc / dec value is too large to fit in a single immediate
|
||
|
operand. In some cases, perhaps it might be better to load the value from a
|
||
|
constantpool instead.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
GCC generates significantly better code for this function.
|
||
|
|
||
|
int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
|
||
|
int i = 0;
|
||
|
|
||
|
if (StackPtr != 0) {
|
||
|
while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
|
||
|
Line[i++] = Stack[--StackPtr];
|
||
|
if (LineLen > 32768)
|
||
|
{
|
||
|
while (StackPtr != 0 && i < LineLen)
|
||
|
{
|
||
|
i++;
|
||
|
--StackPtr;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return StackPtr;
|
||
|
}
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
This should compile to the mlas instruction:
|
||
|
int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
At some point, we should triage these to see if they still apply to us:
|
||
|
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
|
||
|
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
|
||
|
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
|
||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
|
||
|
|
||
|
http://www.inf.u-szeged.hu/gcc-arm/
|
||
|
http://citeseer.ist.psu.edu/debus04linktime.html
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
gcc generates smaller code for this function at -O2 or -Os:
|
||
|
|
||
|
void foo(signed char* p) {
|
||
|
if (*p == 3)
|
||
|
bar();
|
||
|
else if (*p == 4)
|
||
|
baz();
|
||
|
else if (*p == 5)
|
||
|
quux();
|
||
|
}
|
||
|
|
||
|
llvm decides it's a good idea to turn the repeated if...else into a
|
||
|
binary tree, as if it were a switch; the resulting code requires -1
|
||
|
compare-and-branches when *p<=2 or *p==5, the same number if *p==4
|
||
|
or *p>6, and +1 if *p==3. So it should be a speed win
|
||
|
(on balance). However, the revised code is larger, with 4 conditional
|
||
|
branches instead of 3.
|
||
|
|
||
|
More seriously, there is a byte->word extend before
|
||
|
each comparison, where there should be only one, and the condition codes
|
||
|
are not remembered when the same two values are compared twice.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
More LSR enhancements possible:
|
||
|
|
||
|
1. Teach LSR about pre- and post- indexed ops to allow the iv increment to be merged
|
||
|
in a load / store.
|
||
|
2. Allow iv reuse even when a type conversion is required. For example, i8
|
||
|
and i32 load / store addressing modes are identical.
|
||
|
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
This:
|
||
|
|
||
|
int foo(int a, int b, int c, int d) {
|
||
|
long long acc = (long long)a * (long long)b;
|
||
|
acc += (long long)c * (long long)d;
|
||
|
return (int)(acc >> 32);
|
||
|
}
|
||
|
|
||
|
Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
|
||
|
two signed 32-bit values to produce a 64-bit value, and accumulates this with
|
||
|
a 64-bit value.
|
||
|
|
||
|
We currently get this with both v4 and v6:
|
||
|
|
||
|
_foo:
|
||
|
smull r1, r0, r1, r0
|
||
|
smull r3, r2, r3, r2
|
||
|
adds r3, r3, r1
|
||
|
adc r0, r2, r0
|
||
|
bx lr
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
This:
|
||
|
#include <algorithm>
|
||
|
std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
|
||
|
{ return std::make_pair(a + b, a + b < a); }
|
||
|
bool no_overflow(unsigned a, unsigned b)
|
||
|
{ return !full_add(a, b).second; }
|
||
|
|
||
|
Should compile to:
|
||
|
|
||
|
_Z8full_addjj:
|
||
|
adds r2, r1, r2
|
||
|
movcc r1, #0
|
||
|
movcs r1, #1
|
||
|
str r2, [r0, #0]
|
||
|
strb r1, [r0, #4]
|
||
|
mov pc, lr
|
||
|
|
||
|
_Z11no_overflowjj:
|
||
|
cmn r0, r1
|
||
|
movcs r0, #0
|
||
|
movcc r0, #1
|
||
|
mov pc, lr
|
||
|
|
||
|
not:
|
||
|
|
||
|
__Z8full_addjj:
|
||
|
add r3, r2, r1
|
||
|
str r3, [r0]
|
||
|
mov r2, #1
|
||
|
mov r12, #0
|
||
|
cmp r3, r1
|
||
|
movlo r12, r2
|
||
|
str r12, [r0, #+4]
|
||
|
bx lr
|
||
|
__Z11no_overflowjj:
|
||
|
add r3, r1, r0
|
||
|
mov r2, #1
|
||
|
mov r1, #0
|
||
|
cmp r3, r0
|
||
|
movhs r1, r2
|
||
|
mov r0, r1
|
||
|
bx lr
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Some of the NEON intrinsics may be appropriate for more general use, either
|
||
|
as target-independent intrinsics or perhaps elsewhere in the ARM backend.
|
||
|
Some of them may also be lowered to target-independent SDNodes, and perhaps
|
||
|
some new SDNodes could be added.
|
||
|
|
||
|
For example, maximum, minimum, and absolute value operations are well-defined
|
||
|
and standard operations, both for vector and scalar types.
|
||
|
|
||
|
The current NEON-specific intrinsics for count leading zeros and count one
|
||
|
bits could perhaps be replaced by the target-independent ctlz and ctpop
|
||
|
intrinsics. It may also make sense to add a target-independent "ctls"
|
||
|
intrinsic for "count leading sign bits". Likewise, the backend could use
|
||
|
the target-independent SDNodes for these operations.
|
||
|
|
||
|
ARMv6 has scalar saturating and halving adds and subtracts. The same
|
||
|
intrinsics could possibly be used for both NEON's vector implementations of
|
||
|
those operations and the ARMv6 scalar versions.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Split out LDR (literal) from normal ARM LDR instruction. Also consider splitting
|
||
|
LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g.
|
||
|
ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg)
|
||
|
while ARMConstantIslandPass only needs to worry about LDR (literal).
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Constant island pass should make use of full range SoImm values for LEApcrel.
|
||
|
Be careful though as the last attempt caused infinite looping on lencod.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Predication issue. This function:
|
||
|
|
||
|
extern unsigned array[ 128 ];
|
||
|
int foo( int x ) {
|
||
|
int y;
|
||
|
y = array[ x & 127 ];
|
||
|
if ( x & 128 )
|
||
|
y = 123456789 & ( y >> 2 );
|
||
|
else
|
||
|
y = 123456789 & y;
|
||
|
return y;
|
||
|
}
|
||
|
|
||
|
compiles to:
|
||
|
|
||
|
_foo:
|
||
|
and r1, r0, #127
|
||
|
ldr r2, LCPI1_0
|
||
|
ldr r2, [r2]
|
||
|
ldr r1, [r2, +r1, lsl #2]
|
||
|
mov r2, r1, lsr #2
|
||
|
tst r0, #128
|
||
|
moveq r2, r1
|
||
|
ldr r0, LCPI1_1
|
||
|
and r0, r2, r0
|
||
|
bx lr
|
||
|
|
||
|
It would be better to do something like this, to fold the shift into the
|
||
|
conditional move:
|
||
|
|
||
|
and r1, r0, #127
|
||
|
ldr r2, LCPI1_0
|
||
|
ldr r2, [r2]
|
||
|
ldr r1, [r2, +r1, lsl #2]
|
||
|
tst r0, #128
|
||
|
movne r1, r1, lsr #2
|
||
|
ldr r0, LCPI1_1
|
||
|
and r0, r1, r0
|
||
|
bx lr
|
||
|
|
||
|
it saves an instruction and a register.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
|
||
|
with the same bottom half.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Robert Muth started working on an alternate jump table implementation that
|
||
|
does not put the tables in-line in the text. This is more like the llvm
|
||
|
default jump table implementation. This might be useful sometime. Several
|
||
|
revisions of patches are on the mailing list, beginning at:
|
||
|
http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Make use of the "rbit" instruction.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Take a look at test/CodeGen/Thumb2/machine-licm.ll. ARM should be taught how
|
||
|
to licm and cse the unnecessary load from cp#1.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
The CMN instruction sets the flags like an ADD instruction, while CMP sets
|
||
|
them like a subtract. Therefore to be able to use CMN for comparisons other
|
||
|
than the Z bit, we'll need additional logic to reverse the conditionals
|
||
|
associated with the comparison. Perhaps a pseudo-instruction for the comparison,
|
||
|
with a post-codegen pass to clean up and handle the condition codes?
|
||
|
See PR5694 for testcase.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Given the following on armv5:
|
||
|
int test1(int A, int B) {
|
||
|
return (A&-8388481)|(B&8388480);
|
||
|
}
|
||
|
|
||
|
We currently generate:
|
||
|
ldr r2, .LCPI0_0
|
||
|
and r0, r0, r2
|
||
|
ldr r2, .LCPI0_1
|
||
|
and r1, r1, r2
|
||
|
orr r0, r1, r0
|
||
|
bx lr
|
||
|
|
||
|
We should be able to replace the second ldr+and with a bic (i.e. reuse the
|
||
|
constant which was already loaded). Not sure what's necessary to do that.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
|
||
|
|
||
|
int a(int x) { return __builtin_bswap32(x); }
|
||
|
|
||
|
a:
|
||
|
mov r1, #255, 24
|
||
|
mov r2, #255, 16
|
||
|
and r1, r1, r0, lsr #8
|
||
|
and r2, r2, r0, lsl #8
|
||
|
orr r1, r1, r0, lsr #24
|
||
|
orr r0, r2, r0, lsl #24
|
||
|
orr r0, r0, r1
|
||
|
bx lr
|
||
|
|
||
|
Something like the following would be better (fewer instructions/registers):
|
||
|
eor r1, r0, r0, ror #16
|
||
|
bic r1, r1, #0xff0000
|
||
|
mov r1, r1, lsr #8
|
||
|
eor r0, r1, r0, ror #8
|
||
|
bx lr
|
||
|
|
||
|
A custom Thumb version would also be a slight improvement over the generic
|
||
|
version.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Consider the following simple C code:
|
||
|
|
||
|
void foo(unsigned char *a, unsigned char *b, int *c) {
|
||
|
if ((*a | *b) == 0) *c = 0;
|
||
|
}
|
||
|
|
||
|
currently llvm-gcc generates something like this (nice branchless code I'd say):
|
||
|
|
||
|
ldrb r0, [r0]
|
||
|
ldrb r1, [r1]
|
||
|
orr r0, r1, r0
|
||
|
tst r0, #255
|
||
|
moveq r0, #0
|
||
|
streq r0, [r2]
|
||
|
bx lr
|
||
|
|
||
|
Note that both "tst" and "moveq" are redundant.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
When loading immediate constants with movt/movw, if there are multiple
|
||
|
constants needed with the same low 16 bits, and those values are not live at
|
||
|
the same time, it would be possible to use a single movw instruction, followed
|
||
|
by multiple movt instructions to rewrite the high bits to different values.
|
||
|
For example:
|
||
|
|
||
|
volatile store i32 -1, i32* inttoptr (i32 1342210076 to i32*), align 4,
|
||
|
!tbaa
|
||
|
!0
|
||
|
volatile store i32 -1, i32* inttoptr (i32 1342341148 to i32*), align 4,
|
||
|
!tbaa
|
||
|
!0
|
||
|
|
||
|
is compiled and optimized to:
|
||
|
|
||
|
movw r0, #32796
|
||
|
mov.w r1, #-1
|
||
|
movt r0, #20480
|
||
|
str r1, [r0]
|
||
|
movw r0, #32796 @ <= this MOVW is not needed, value is there already
|
||
|
movt r0, #20482
|
||
|
str r1, [r0]
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Improve codegen for select's:
|
||
|
if (x != 0) x = 1
|
||
|
if (x == 1) x = 1
|
||
|
|
||
|
ARM codegen used to look like this:
|
||
|
mov r1, r0
|
||
|
cmp r1, #1
|
||
|
mov r0, #0
|
||
|
moveq r0, #1
|
||
|
|
||
|
The naive lowering selects between two different values. It should recognize the
|
||
|
test is an equality test, so it's more of a conditional move than a select:
|
||
|
cmp r0, #1
|
||
|
movne r0, #0
|
||
|
|
||
|
Currently this is an ARM-specific dag combine. We probably should make it into a
|
||
|
target-neutral one.
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Optimize unnecessary checks for zero with __builtin_clz/ctz. Those builtins
|
||
|
are specified to be undefined at zero, so portable code must check for zero
|
||
|
and handle it as a special case. That is unnecessary on ARM where those
|
||
|
operations are implemented in a way that is well-defined for zero. For
|
||
|
example:
|
||
|
|
||
|
int f(int x) { return x ? __builtin_clz(x) : sizeof(int)*8; }
|
||
|
|
||
|
should just be implemented with a CLZ instruction. Since there are other
|
||
|
targets, e.g., PPC, that share this behavior, it would be best to implement
|
||
|
this in a target-independent way: we should probably fold that (when using
|
||
|
"undefined at zero" semantics) to set the "defined at zero" bit and have
|
||
|
the code generator expand out the right code.
|
||
|
|
||
|
|
||
|
//===---------------------------------------------------------------------===//
|
||
|
|
||
|
Clean up the test/MC/ARM files to have more robust register choices.
|
||
|
|
||
|
R0 should not be used as a register operand in the assembler tests as it's then
|
||
|
not possible to distinguish between a correct encoding and a missing operand
|
||
|
encoding, as zero is the default value for the binary encoder.
|
||
|
e.g.,
|
||
|
add r0, r0 // bad
|
||
|
add r3, r5 // good
|
||
|
|
||
|
Register operands should be distinct. That is, when the encoding does not
|
||
|
require two syntactical operands to refer to the same register, two different
|
||
|
registers should be used in the test so as to catch errors where the
|
||
|
operands are swapped in the encoding.
|
||
|
e.g.,
|
||
|
subs.w r1, r1, r1 // bad
|
||
|
subs.w r1, r2, r3 // good
|
||
|
|