mirror of https://github.com/c64scene-ar/llvm-6502.git
Commit 376642ed62
1. Teach it to use overlapping unaligned load / store to copy / set the trailing bytes. e.g. On x86, use two pairs of movups / movaps for 17 - 31 byte copies (see the sketch below).
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is, e.g. x86 and ARM.
3. When memcpy'ing from a constant string, do *not* replace the load with a constant if it's not possible to materialize an integer immediate with a single instruction (this required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if the target hooks indicate they are "fast".
5. Update the ARM target hooks to use unaligned load / stores, e.g. vld1.8 / vst1.8. Also increase the threshold to something reasonable (8 for memset, 4 pairs for memcpy).

This significantly improves Dhrystone, up to 50% on ARM iOS devices.

rdar://12760078

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
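Point 1 can be pictured outside the code generator as well. The following is a minimal sketch, not LLVM's implementation: assuming SSE2 intrinsics and a made-up helper name copy_17_to_31, it shows how a copy of n bytes (17 <= n <= 31) needs only two unaligned 16-byte load/store pairs, with the second pair starting at offset n - 16 so that it overlaps the first.

// Minimal sketch of the overlapping-copy idea (illustration only, not
// LLVM's code generator). The helper name copy_17_to_31 is made up.
#include <cstddef>
#include <emmintrin.h> // SSE2: _mm_loadu_si128 / _mm_storeu_si128

static void copy_17_to_31(char *dst, const char *src, std::size_t n) {
  // Caller guarantees 17 <= n <= 31, so the two 16-byte chunks cover
  // all n bytes and overlap by 32 - n bytes in the middle.
  __m128i lo = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
  __m128i hi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + n - 16));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), lo);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + n - 16), hi);
}

The test below exercises exactly this for a 28-byte copy: n - 16 = 12, so FileCheck expects loads from L_str and L_str+12 and stores to (%rsp) and 12(%rsp). The store to (%rsp) can be an aligned movaps because the stack slot ends up 16-byte aligned; the loads from the align-1 string and the store at offset 12 stay movups.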
30 lines
902 B
LLVM
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://7396984

@str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1

define void @t(i32 %count) ssp nounwind {
entry:
; CHECK: t:
; CHECK: movups L_str+12(%rip), %xmm0
; CHECK: movups L_str(%rip), %xmm1
  %tmp0 = alloca [60 x i8], align 1
  %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
  br label %bb1

bb1:
; CHECK: LBB0_1:
; CHECK: movups %xmm0, 12(%rsp)
; CHECK: movaps %xmm1, (%rsp)
  %tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
  %tmp3 = add i32 %tmp2, 1
  %tmp4 = icmp eq i32 %tmp3, %count
  br i1 %tmp4, label %bb2, label %bb1

bb2:
  ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
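For a source-level picture, here is a rough C++ analogue of the IR above, offered as an assumption for illustration rather than the original program behind this test. It loops memcpy'ing the 28-byte constant string into a 60-byte stack buffer; with the lowering described in the commit message, the 28-byte memcpy becomes the movaps/movups pair that the CHECK lines in the loop body look for.

// Rough C++ analogue of the IR test (illustrative only; the IR loop is
// do-while shaped, and nothing reads the buffer afterwards, so an
// optimizing front end could well delete the copy entirely).
#include <cstring>

void t(int count) {
  char buf[60];
  int i = 0;
  do {
    // 27 'x' characters plus the terminating NUL: 28 bytes, like @str.
    std::memcpy(buf, "xxxxxxxxxxxxxxxxxxxxxxxxxxx", 28);
  } while (++i != count);
}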