From eb05f90c71fc7b42e52d4f363fd8325bace9a0e7 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Thu, 14 Feb 2008 06:19:02 +0000 Subject: [PATCH] upgrade some entries, remove stuff that is done. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47109 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/README-SSE.txt | 50 +++++--------- lib/Target/X86/README.txt | 126 ++++++++-------------------------- 2 files changed, 44 insertions(+), 132 deletions(-) diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 197dae38c6f..2ccabca2fc7 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -56,22 +56,23 @@ store tmp -> [xslot] time, not at spiller time). *Note* however that this can only be done if Y is dead. Here's a testcase: -%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0] -implementation ; Functions: -declare void %printf(int, ...) -void %main() { +@.str_3 = external global [15 x i8] ; <[15 x i8]*> [#uses=0] +declare void @printf(i32, ...) +define void @main() { build_tree.exit: - br label %no_exit.i7 -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; [#uses=1] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; [#uses=1] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 - br bool false, label %Compute_Tree.exit23, label %no_exit.i7 -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (int, ...)* %printf( int 0 ) - store double %tmp.34.i18, double* null - ret void + br label %no_exit.i7 + +no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit + %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; [#uses=1] + %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; [#uses=1] + %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 ; [#uses=1] + %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 ; [#uses=2] + br i1 false, label %Compute_Tree.exit23, label %no_exit.i7 + +Compute_Tree.exit23: ; preds = %no_exit.i7 + tail call void (i32, ...)* @printf( i32 0 ) + store double %tmp.34.i18, double* null + ret void } We currently emit: @@ -125,25 +126,6 @@ more experiments on different x86 machines. //===---------------------------------------------------------------------===// -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This will be solved when we go to a dynamic programming based isel. - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index e9f0d7338b3..5a4f7c4e5f1 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -435,44 +435,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). //===---------------------------------------------------------------------===// -Consider this: - -typedef struct pair { float A, B; } pair; -void pairtest(pair P, float *FP) { - *FP = P.A+P.B; -} - -We currently generate this code with llvmgcc4: - -_pairtest: - movl 8(%esp), %eax - movl 4(%esp), %ecx - movd %eax, %xmm0 - movd %ecx, %xmm1 - addss %xmm0, %xmm1 - movl 12(%esp), %eax - movss %xmm1, (%eax) - ret - -we should be able to generate: -_pairtest: - movss 4(%esp), %xmm0 - movl 12(%esp), %eax - addss 8(%esp), %xmm0 - movss %xmm0, (%eax) - ret - -The issue is that llvmgcc4 is forcing the struct to memory, then passing it as -integer chunks. It does this so that structs like {short,short} are passed in -a single 32-bit integer stack slot. We should handle the safe cases above much -nicer, while still handling the hard cases. - -While true in general, in this specific case we could do better by promoting -load int + bitcast to float -> load fload. This basically needs alignment info, -the code is already implemented (but disabled) in dag combine). - -//===---------------------------------------------------------------------===// - Another instruction selector deficiency: void %bar() { @@ -551,25 +513,24 @@ do not make use of. //===---------------------------------------------------------------------===// -int %foo(int* %a, int %t) { +define i32 @foo(i32* %a, i32 %t) { entry: - br label %cond_true + br label %cond_true -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] - %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] - %tmp2 = getelementptr int* %a, int %x.0.0 - %tmp3 = load int* %tmp2 ; [#uses=1] - %tmp5 = add int %t_addr.0.0, %x.0.0 ; [#uses=1] - %tmp7 = add int %tmp5, %tmp3 ; [#uses=2] - %tmp9 = add int %x.0.0, 1 ; [#uses=2] - %tmp = setgt int %tmp9, 39 ; [#uses=1] - br bool %tmp, label %bb12, label %cond_true +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; [#uses=3] + %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; [#uses=1] + %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; [#uses=1] + %tmp3 = load i32* %tmp2 ; [#uses=1] + %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; [#uses=1] + %tmp7 = add i32 %tmp5, %tmp3 ; [#uses=2] + %tmp9 = add i32 %x.0.0, 1 ; [#uses=2] + %tmp = icmp sgt i32 %tmp9, 39 ; [#uses=1] + br i1 %tmp, label %bb12, label %cond_true -bb12: ; preds = %cond_true - ret int %tmp7 +bb12: ; preds = %cond_true + ret i32 %tmp7 } - is pessimized by -loop-reduce and -indvars //===---------------------------------------------------------------------===// @@ -704,9 +665,9 @@ The add\sub pair is really unneeded here. Consider the expansion of: -uint %test3(uint %X) { - %tmp1 = rem uint %X, 255 - ret uint %tmp1 +define i32 @test3(i32 %X) { + %tmp1 = urem i32 %X, 255 + ret i32 %tmp1 } Currently it compiles to: @@ -948,22 +909,22 @@ Another example is: ;; allocator turns the shift into an LEA. This also occurs for ADD. ; Check that the shift gets turned into an LEA. -; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \ ; RUN: not grep {mov E.X, E.X} -%G = external global int +@G = external global i32 ; [#uses=3] -int %test1(int %X, int %Y) { - %Z = add int %X, %Y - volatile store int %Y, int* %G - volatile store int %Z, int* %G - ret int %X +define i32 @test1(i32 %X, i32 %Y) { + %Z = add i32 %X, %Y ; [#uses=1] + volatile store i32 %Y, i32* @G + volatile store i32 %Z, i32* @G + ret i32 %X } -int %test2(int %X) { - %Z = add int %X, 1 ;; inc - volatile store int %Z, int* %G - ret int %X +define i32 @test2(i32 %X) { + %Z = add i32 %X, 1 ; [#uses=1] + volatile store i32 %Z, i32* @G + ret i32 %X } //===---------------------------------------------------------------------===// @@ -1238,37 +1199,6 @@ suggests using the 32-bit register (which is what ICC uses). //===---------------------------------------------------------------------===// -rdar://5506677 - We compile this: - -define i32 @foo(double %x) { - %x14 = bitcast double %x to i64 ; [#uses=1] - %tmp713 = trunc i64 %x14 to i32 ; [#uses=1] - %tmp8 = and i32 %tmp713, 2147483647 ; [#uses=1] - ret i32 %tmp8 -} - -to: - -_foo: - subl $12, %esp - fldl 16(%esp) - fstpl (%esp) - movl $2147483647, %eax - andl (%esp), %eax - addl $12, %esp - #FP_REG_KILL - ret - -It would be much better to eliminate the fldl/fstpl by folding the bitcast -into the load SDNode. That would give us: - -_foo: - movl $2147483647, %eax - andl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - We compile this: void compare (long long foo) {