llvm-6502/test/CodeGen/X86/store-narrow.ll

169 lines
4.0 KiB
LLVM
Raw Normal View History

; Codegen test for narrowing of load / mask / or / store sequences: the
; functions below replace part of an integer in memory, and the check lines
; state which byte- or word-sized instructions llc is expected to emit.
; llc is invoked twice on this file, once with the target triple declared
; below and once with -march=x86, and each output is verified by FileCheck
; under its own check prefix (one for 64-bit output, one for 32-bit output).
; -asm-verbose=false is passed to llc in both invocations.
; rdar://7860110
; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s -check-prefix=X32
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.2"
; test1 - replace bits 0-7 of the i32 at %a0 with the i8 argument %a1.
; The IR loads the word, clears the low byte with the 0xFFFFFF00 mask, ORs in
; the zero-extended byte and stores the word back.
; 64-bit expectation: a byte store of %sil to offset 0 of %rdi.
; 32-bit expectation: the byte is read from 8(%esp) into %al and then stored
; with movb through some address register.
define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
%A = load i32* %a0, align 4
%B = and i32 %A, -256 ; 0xFFFFFF00
%C = zext i8 %a1 to i32
%D = or i32 %C, %B
store i32 %D, i32* %a0, align 4
ret void
; X64-LABEL: test1:
; X64: movb %sil, (%rdi)
; X32-LABEL: test1:
; X32: movb 8(%esp), %al
; X32: movb %al, (%{{.*}})
}
; test2 - replace bits 8-15 of the i32 at %a0 with the i8 argument %a1.
; The IR loads the word, clears byte 1 with the 0xFFFF00FF mask, shifts the
; zero-extended byte left by 8, ORs it in and stores the word back.
; 64-bit expectation: a byte store of %sil to offset 1 of %rdi.
; 32-bit expectation: the byte is read from 8(%esp) into the low byte of one
; of the a/b/c/d registers (captured as REG) and that same register is then
; stored with movb to offset 1 of some address register.
define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
%A = load i32* %a0, align 4
%B = and i32 %A, -65281 ; 0xFFFF00FF
%C = zext i8 %a1 to i32
%CS = shl i32 %C, 8
%D = or i32 %B, %CS
store i32 %D, i32* %a0, align 4
ret void
; X64-LABEL: test2:
; X64: movb %sil, 1(%rdi)
; X32-LABEL: test2:
; X32: movb 8(%esp), %[[REG:[abcd]]]l
; X32: movb %[[REG]]l, 1(%{{.*}})
}
; test3 - replace bits 0-15 of the i32 at %a0 with the i16 argument %a1.
; The IR loads the word, clears the low half with the 0xFFFF0000 mask, ORs in
; the zero-extended i16 and stores the word back.
; 64-bit expectation: a 16-bit store of %si to offset 0 of %rdi.
; 32-bit expectation: the value is read from 8(%esp) into %ax and then stored
; with movw through some address register.
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
%A = load i32* %a0, align 4
%B = and i32 %A, -65536 ; 0xFFFF0000
%C = zext i16 %a1 to i32
%D = or i32 %B, %C
store i32 %D, i32* %a0, align 4
ret void
; X64-LABEL: test3:
; X64: movw %si, (%rdi)
; X32-LABEL: test3:
; X32: movw 8(%esp), %ax
; X32: movw %ax, (%{{.*}})
}
; test4 - replace bits 16-31 of the i32 at %a0 with the i16 argument %a1.
; The IR loads the word, keeps only the low half with the 0x0000FFFF mask,
; shifts the zero-extended i16 left by 16, ORs it in and stores the word back.
; 64-bit expectation: a 16-bit store of %si to offset 2 of %rdi.
; 32-bit expectation: the value is read from 8(%esp) into the 16-bit form of
; one of the a/b/c/d registers (captured as REG) and that same register is
; then stored with movw to offset 2 of some address register.
define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
%A = load i32* %a0, align 4
%B = and i32 %A, 65535 ; 0x0000FFFF
%C = zext i16 %a1 to i32
%CS = shl i32 %C, 16
%D = or i32 %B, %CS
store i32 %D, i32* %a0, align 4
ret void
; X64-LABEL: test4:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test4:
; X32: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
; test5 - replace bits 16-31 of the i64 at %a0 with the i16 argument %a1.
; The IR loads the 64-bit value (declared align 4), clears bits 16-31 with the
; 0xFFFFFFFF0000FFFF mask, shifts the zero-extended i16 left by 16, ORs it in
; and stores the value back.
; 64-bit expectation: a 16-bit store of %si to offset 2 of %rdi.
; 32-bit expectation: the value is read from 8(%esp) into the 16-bit form of
; one of the a/b/c/d registers (captured as REG) and that same register is
; then stored with movw to offset 2 of some address register.
define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
%A = load i64* %a0, align 4
%B = and i64 %A, -4294901761 ; 0xFFFFFFFF0000FFFF
%C = zext i16 %a1 to i64
%CS = shl i64 %C, 16
%D = or i64 %B, %CS
store i64 %D, i64* %a0, align 4
ret void
; X64-LABEL: test5:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test5:
; X32: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
; test6 - replace bits 40-47 of the i64 at %a0 with the i8 argument %a1.
; The IR loads the 64-bit value (declared align 4), clears that byte with the
; 0xFFFF00FFFFFFFFFF mask, shifts the zero-extended byte left by 40, ORs it in
; and stores the value back.
; 64-bit expectation: a byte store of %sil to offset 5 of %rdi.
; 32-bit expectation: the byte is read from 8(%esp) into al/bl/cl/dl (captured
; as REG) and that same register is then stored with movb to offset 5 of some
; address register.
define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
%A = load i64* %a0, align 4
%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF
%C = zext i8 %a1 to i64
%CS = shl i64 %C, 40
%D = or i64 %B, %CS
store i64 %D, i64* %a0, align 4
ret void
; X64-LABEL: test6:
; X64: movb %sil, 5(%rdi)
; X32-LABEL: test6:
; X32: movb 8(%esp), %[[REG:[abcd]l]]
; X32: movb %[[REG]], 5(%{{.*}})
}
; test7 - same byte replacement as test6 (bits 40-47 of the i64 at %a0), but
; the function first loads an i32 through a third pointer %P2 and returns that
; value after the store.
; The expected store instructions are the same as in test6: a byte store of
; %sil to offset 5 of %rdi for 64-bit output, and a movb of the captured REG
; to offset 5 for 32-bit output.
; NOTE(review): presumably this checks that an additional, earlier load from
; another pointer does not prevent the narrow store - confirm against history.
define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
entry:
%OtherLoad = load i32 *%P2
%A = load i64* %a0, align 4
%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF
%C = zext i8 %a1 to i64
%CS = shl i64 %C, 40
%D = or i64 %B, %CS
store i64 %D, i64* %a0, align 4
ret i32 %OtherLoad
; X64-LABEL: test7:
; X64: movb %sil, 5(%rdi)
; X32-LABEL: test7:
; X32: movb 8(%esp), %[[REG:[abcd]l]]
; X32: movb %[[REG]], 5(%{{.*}})
}
; PR7833
; @g_16 is an internal i32 global initialised to -1; it is read and written
; by test8 and by test9.
; test8 loads @g_16, stores 0 to it, ORs 1 into the previously loaded value
; and stores that result back. Only 64-bit output is checked. The directives
; below require the label to be followed, line by line, by: a 32-bit load of
; the global into %eax, a 32-bit store of 0 to the global, an orl of 1 into
; %eax, a 32-bit store of %eax to the global, and the return. That is, a
; full-width sequence is expected here, with the store of 0 kept between the
; load and the final store.
@g_16 = internal global i32 -1
; X64-LABEL: test8:
; X64-NEXT: movl _g_16(%rip), %eax
; X64-NEXT: movl $0, _g_16(%rip)
; X64-NEXT: orl $1, %eax
; X64-NEXT: movl %eax, _g_16(%rip)
; X64-NEXT: ret
define void @test8() nounwind {
%tmp = load i32* @g_16
store i32 0, i32* @g_16
%or = or i32 %tmp, 1
store i32 %or, i32* @g_16
ret void
}
; test9 loads the global @g_16, ORs 1 into the value and stores it back, with
; no other store in between. Only 64-bit output is checked: the label must be
; followed immediately by a single byte-sized OR of 1 applied directly to the
; global in memory, and then by the return.
; X64-LABEL: test9:
; X64-NEXT: orb $1, _g_16(%rip)
; X64-NEXT: ret
define void @test9() nounwind {
%tmp = load i32* @g_16
%or = or i32 %tmp, 1
store i32 %or, i32* @g_16
ret void
}
; rdar://8494845 + PR8244
; test10 loads an i8 from %P, sign-extends it to i32, logically shifts the
; result right by 8 and truncates back to i8 for the return value.
; Only 64-bit output is checked: the label must be followed immediately by a
; sign-extending byte load from (%rdi) into %eax, a 32-bit right shift by 8,
; and the return.
; NOTE(review): presumably this guards against the sext/shift/trunc chain
; being folded into an incorrectly narrowed load - confirm against PR8244.
; X64-LABEL: test10:
; X64-NEXT: movsbl (%rdi), %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: ret
define i8 @test10(i8* %P) nounwind ssp {
entry:
%tmp = load i8* %P, align 1
%conv = sext i8 %tmp to i32
%shr3 = lshr i32 %conv, 8
%conv2 = trunc i32 %shr3 to i8
ret i8 %conv2
}