diff --git a/lib/Target/README.txt b/lib/Target/README.txt index f047d087cbf..fa5e6010cfc 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -2,38 +2,6 @@ Target Independent Opportunities: //===---------------------------------------------------------------------===// -We should recognize idioms for add-with-carry and turn it into the appropriate -intrinsics. This example: - -unsigned add32carry(unsigned sum, unsigned x) { - unsigned z = sum + x; - if (sum + x < x) - z++; - return z; -} - -Compiles to: clang t.c -S -o - -O3 -fomit-frame-pointer -m64 -mkernel - -_add32carry: ## @add32carry - addl %esi, %edi - cmpl %esi, %edi - sbbl %eax, %eax - andl $1, %eax - addl %edi, %eax - ret - -with clang, but to: - -_add32carry: - leal (%rsi,%rdi), %eax - cmpl %esi, %eax - adcl $0, %eax - ret - -with gcc. - -//===---------------------------------------------------------------------===// - Dead argument elimination should be enhanced to handle cases when an argument is dead to an externally visible function. Though the argument can't be removed from the externally visible function, the caller doesn't need to pass it in. @@ -82,6 +50,9 @@ unsigned int mul(unsigned int a,unsigned int b) { return a*b; } +The legalization code for mul-with-overflow needs to be made more robust before +this can be implemented though. + //===---------------------------------------------------------------------===// Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and @@ -92,41 +63,6 @@ right). //===---------------------------------------------------------------------===// -Solve this DAG isel folding deficiency: - -int X, Y; - -void fn1(void) -{ - X = X | (Y << 3); -} - -compiles to - -fn1: - movl Y, %eax - shll $3, %eax - orl X, %eax - movl %eax, X - ret - -The problem is the store's chain operand is not the load X but rather -a TokenFactor of the load X and load Y, which prevents the folding. - -There are two ways to fix this: - -1. The dag combiner can start using alias analysis to realize that y/x - don't alias, making the store to X not dependent on the load from Y. -2. The generated isel could be made smarter in the case it can't - disambiguate the pointers. - -Number 1 is the preferred solution. - -This has been "fixed" by a TableGen hack. But that is a short term workaround -which will be removed once the proper fix is made. - -//===---------------------------------------------------------------------===// - On targets with expensive 64-bit multiply, we could LSR this: for (i = ...; ++i) { @@ -339,14 +275,6 @@ unsigned long reverse(unsigned v) { return v ^ (t >> 8); } -Neither is this (very standard idiom): - -int f(int n) -{ - return (((n) << 24) | (((n) & 0xff00) << 8) - | (((n) >> 8) & 0xff00) | ((n) >> 24)); -} - //===---------------------------------------------------------------------===// [LOOP RECOGNITION] @@ -382,9 +310,7 @@ unsigned int popcount(unsigned int input) { return count; } -This is a form of idiom recognition for loops, the same thing that could be -useful for recognizing memset/memcpy. This sort of thing should be added to the -loop idiom pass. +This sort of thing should be added to the loop idiom pass. //===---------------------------------------------------------------------===// @@ -639,46 +565,21 @@ struct THotKey { short Key; bool Control; bool Shift; bool Alt; }; extern THotKey m_HotKey; THotKey GetHotKey () { return m_HotKey; } -into (-O3 -fno-exceptions -static -fomit-frame-pointer): +into (-m64 -O3 -fno-exceptions -static -fomit-frame-pointer): -__Z9GetHotKeyv: - pushl %esi - movl 8(%esp), %eax - movb _m_HotKey+3, %cl - movb _m_HotKey+4, %dl - movb _m_HotKey+2, %ch - movw _m_HotKey, %si - movw %si, (%eax) - movb %ch, 2(%eax) - movb %cl, 3(%eax) - movb %dl, 4(%eax) - popl %esi - ret $4 - -GCC produces: - -__Z9GetHotKeyv: - movl _m_HotKey, %edx - movl 4(%esp), %eax - movl %edx, (%eax) - movzwl _m_HotKey+4, %edx - movw %dx, 4(%eax) - ret $4 - -The LLVM IR contains the needed alignment info, so we should be able to -merge the loads and stores into 4-byte loads: - - %struct.THotKey = type { i16, i8, i8, i8 } -define void @_Z9GetHotKeyv(%struct.THotKey* sret %agg.result) nounwind { -... - %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8 - %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2 - %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1 - %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2 - -Alternatively, we should use a small amount of base-offset alias analysis -to make it so the scheduler doesn't need to hold all the loads in regs at -once. +__Z9GetHotKeyv: ## @_Z9GetHotKeyv + movq _m_HotKey@GOTPCREL(%rip), %rax + movzwl (%rax), %ecx + movzbl 2(%rax), %edx + shlq $16, %rdx + orq %rcx, %rdx + movzbl 3(%rax), %ecx + shlq $24, %rcx + orq %rdx, %rcx + movzbl 4(%rax), %eax + shlq $32, %rax + orq %rcx, %rax + ret //===---------------------------------------------------------------------===// @@ -764,20 +665,6 @@ etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate matching code in dag combine doesn't look through truncates aggressively enough. Here are some testcases reduces from GCC PR17886: -unsigned long long f(unsigned long long x, int y) { - return (x << y) | (x >> 64-y); -} -unsigned f2(unsigned x, int y){ - return (x << y) | (x >> 32-y); -} -unsigned long long f3(unsigned long long x){ - int y = 9; - return (x << y) | (x >> 64-y); -} -unsigned f4(unsigned x){ - int y = 10; - return (x << y) | (x >> 32-y); -} unsigned long long f5(unsigned long long x, unsigned long long y) { return (x << 8) | ((y >> 48) & 0xffull); } @@ -796,11 +683,6 @@ unsigned long long f6(unsigned long long x, unsigned long long y, int z) { } } -On X86-64, we only handle f2/f3/f4 right. On x86-32, a few of these -generate truly horrible code, instead of using shld and friends. On -ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is -badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel. - //===---------------------------------------------------------------------===// This (and similar related idioms): diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 4005431b75a..551d9f08526 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1507,6 +1507,8 @@ loop, the value comes into the loop as two values, and RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the constructed BUILD_PAIR which represents the cast value. +This can be handled by making CodeGenPrepare sink the cast. + //===---------------------------------------------------------------------===// Test instructions can be eliminated by using EFLAGS values from arithmetic @@ -1847,3 +1849,38 @@ _foo: 0 is the only unsigned number < 1. //===---------------------------------------------------------------------===// + +This code: + +%0 = type { i32, i1 } + +define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { +entry: + %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) + %cmp = extractvalue %0 %uadd, 1 + %inc = zext i1 %cmp to i32 + %add = add i32 %x, %sum + %z.0 = add i32 %add, %inc + ret i32 %z.0 +} + +declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone + +compiles to: + +_add32carry: ## @add32carry + addl %esi, %edi + sbbl %ecx, %ecx + movl %edi, %eax + subl %ecx, %eax + ret + +But it could be: + +_add32carry: + leal (%rsi,%rdi), %eax + cmpl %esi, %eax + adcl $0, %eax + ret + +//===---------------------------------------------------------------------===//