Revert r122955. It seems that using movups to lower memcpy can cause massive regressions (even on Nehalem) in edge cases. I also didn't see any real performance benefit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123015 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2011-01-07 19:35:30 +00:00
parent 1434f66b2e
commit a5e1362f96
11 changed files with 101 additions and 79 deletions

View File

@ -1063,8 +1063,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// linux. This is because the stack realignment code can't handle certain // linux. This is because the stack realignment code can't handle certain
// cases like PR2962. This should be removed when PR2962 is fixed. // cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction(); const Function *F = MF.getFunction();
if (NonScalarIntSafe && !F->hasFnAttr(Attribute::NoImplicitFloat)) { if (NonScalarIntSafe &&
!F->hasFnAttr(Attribute::NoImplicitFloat)) {
if (Size >= 16 && if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) { Subtarget->getStackAlignment() >= 16) {
if (Subtarget->hasSSE2()) if (Subtarget->hasSSE2())
return MVT::v4i32; return MVT::v4i32;

View File

@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; rdar://7842028 ; rdar://7842028
; Do not delete partially dead copy instructions. ; Do not delete partially dead copy instructions.
@ -9,7 +9,7 @@
%struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 } %struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
%struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 } %struct.FC = type { [10 x i8], [32 x i32], %struct.FC*, i32 }
define void @t(%struct.F* %this) nounwind optsize { define void @t(%struct.F* %this) nounwind {
entry: entry:
; CHECK: t: ; CHECK: t:
; CHECK: addq $12, %rsi ; CHECK: addq $12, %rsi

View File

@ -26,7 +26,7 @@ bb:
; CHECK: rep;stosl ; CHECK: rep;stosl
%tmp5 = bitcast i32* %tmp4 to i8* %tmp5 = bitcast i32* %tmp4 to i8*
call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 124, i32 4, i1 false) call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false)
%tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62 %tmp6 = getelementptr inbounds %struct.type* %s, i32 0, i32 62
store i32* null, i32** %tmp6, align 8 store i32* null, i32** %tmp6, align 8
br label %bb1 br label %bb1

View File

@ -19,8 +19,8 @@ entry:
} }
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax ; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
; CHECK: movb 30(%rsp), %cl ; CHECK: movb 30(%rsp), %dl
; CHECK: movb (%rsp), %dl ; CHECK: movb (%rsp), %sil
; CHECK: movb %dl, (%rsp) ; CHECK: movb %sil, (%rsp)
; CHECK: movb %cl, 30(%rsp) ; CHECK: movb %dl, 30(%rsp)
; CHECK: callq ___stack_chk_fail ; CHECK: callq ___stack_chk_fail

View File

@ -1,4 +1,5 @@
; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
@ -14,6 +15,13 @@ entry:
; SSE2: movl $0 ; SSE2: movl $0
; SSE2: movl $0 ; SSE2: movl $0
; SSE1: t1:
; SSE1: movaps _.str, %xmm0
; SSE1: movaps %xmm0
; SSE1: movb $0
; SSE1: movl $0
; SSE1: movl $0
; NOSSE: t1: ; NOSSE: t1:
; NOSSE: movb $0 ; NOSSE: movb $0
; NOSSE: movl $0 ; NOSSE: movl $0
@ -43,6 +51,10 @@ entry:
; SSE2: movaps (%eax), %xmm0 ; SSE2: movaps (%eax), %xmm0
; SSE2: movaps %xmm0, (%eax) ; SSE2: movaps %xmm0, (%eax)
; SSE1: t2:
; SSE1: movaps (%eax), %xmm0
; SSE1: movaps %xmm0, (%eax)
; NOSSE: t2: ; NOSSE: t2:
; NOSSE: movl ; NOSSE: movl
; NOSSE: movl ; NOSSE: movl
@ -67,8 +79,22 @@ entry:
define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
entry: entry:
; SSE2: t3: ; SSE2: t3:
; SSE2: movups (%eax), %xmm0 ; SSE2: movsd (%eax), %xmm0
; SSE2: movups %xmm0, (%eax) ; SSE2: movsd 8(%eax), %xmm1
; SSE2: movsd %xmm1, 8(%eax)
; SSE2: movsd %xmm0, (%eax)
; SSE1: t3:
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; SSE1: movl
; NOSSE: t3: ; NOSSE: t3:
; NOSSE: movl ; NOSSE: movl
@ -83,8 +109,10 @@ entry:
; NOSSE: movl ; NOSSE: movl
; X86-64: t3: ; X86-64: t3:
; X86-64: movups (%rsi), %xmm0 ; X86-64: movq (%rsi), %rax
; X86-64: movups %xmm0, (%rdi) ; X86-64: movq 8(%rsi), %rcx
; X86-64: movq %rcx, 8(%rdi)
; X86-64: movq %rax, (%rdi)
%tmp2 = bitcast %struct.s0* %a to i8* ; <i8*> [#uses=1] %tmp2 = bitcast %struct.s0* %a to i8* ; <i8*> [#uses=1]
%tmp3 = bitcast %struct.s0* %b to i8* ; <i8*> [#uses=1] %tmp3 = bitcast %struct.s0* %b to i8* ; <i8*> [#uses=1]
tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8) tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
@ -94,12 +122,24 @@ entry:
define void @t4() nounwind { define void @t4() nounwind {
entry: entry:
; SSE2: t4: ; SSE2: t4:
; SSE2: movups _.str2, %xmm0 ; SSE2: movw $120
; SSE2: movaps %xmm0, (%esp)
; SSE2: movw $120, 28(%esp)
; SSE2: movl $2021161080 ; SSE2: movl $2021161080
; SSE2: movl $2021161080 ; SSE2: movl $2021161080
; SSE2: movl $2021161080 ; SSE2: movl $2021161080
; SSE2: movl $2021161080
; SSE2: movl $2021161080
; SSE2: movl $2021161080
; SSE2: movl $2021161080
; SSE1: t4:
; SSE1: movw $120
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; SSE1: movl $2021161080
; NOSSE: t4: ; NOSSE: t4:
; NOSSE: movw $120 ; NOSSE: movw $120
@ -114,8 +154,8 @@ entry:
; X86-64: t4: ; X86-64: t4:
; X86-64: movabsq $8680820740569200760, %rax ; X86-64: movabsq $8680820740569200760, %rax
; X86-64: movq %rax ; X86-64: movq %rax
; X86-64: movups _.str2(%rip), %xmm0 ; X86-64: movq %rax
; X86-64: movaps %xmm0, -40(%rsp) ; X86-64: movq %rax
; X86-64: movw $120 ; X86-64: movw $120
; X86-64: movl $2021161080 ; X86-64: movl $2021161080
%tmp1 = alloca [30 x i8] %tmp1 = alloca [30 x i8]

View File

@ -37,34 +37,26 @@ entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false) tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false)
ret void ret void
; LINUX: test3: ; LINUX: test3:
; LINUX-NOT: memcpy ; LINUX: memcpy
; LINUX: movups
; LINUX: movups
; LINUX: movups
; LINUX: movups
; LINUX: movups
; LINUX: movups
; LINUX: movups
; LINUX: movups
; DARWIN: test3: ; DARWIN: test3:
; DARWIN-NOT: memcpy ; DARWIN-NOT: memcpy
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
; DARWIN: movups ; DARWIN: movq
} }
; Large constant memcpy's should be inlined when not optimizing for size. ; Large constant memcpy's should be inlined when not optimizing for size.

View File

@ -5,21 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
define fastcc void @t1() nounwind { define fastcc void @t1() nounwind {
entry: entry:
; CHECK: t1: ; CHECK: t1:
; CHECK: pxor %xmm0, %xmm0 ; CHECK: calll _memset
; CHECK: movups %xmm0, 160
; CHECK: movups %xmm0, 144
; CHECK: movups %xmm0, 128
; CHECK: movups %xmm0, 112
; CHECK: movups %xmm0, 96
; CHECK: movups %xmm0, 80
; CHECK: movups %xmm0, 64
; CHECK: movups %xmm0, 48
; CHECK: movups %xmm0, 32
; CHECK: movups %xmm0, 16
; CHECK: movups %xmm0, 0
; CHECK: movl $0, 184
; CHECK: movl $0, 180
; CHECK: movl $0, 176
call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
unreachable unreachable
} }

View File

@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movups | count 5 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
define void @bork() nounwind { define void @bork() nounwind {
entry: entry:

View File

@ -1,12 +1,8 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind { define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind {
entry: entry:
; CHECK: ccosl:
; CHECK: movaps
; CHECK: movaps
; CHECK: movups
; CHECK: movups
%iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3] %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1] %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
%tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1] %tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1]

View File

@ -10,12 +10,8 @@ entry:
unreachable unreachable
; CHECK: movq _c@TLVP(%rip), %rdi ; CHECK: movq _c@TLVP(%rip), %rdi
; CHECK-NEXT: callq *(%rdi) ; CHECK-NEXT: callq *(%rdi)
; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: movl $0, 56(%rax)
; CHECK-NEXT: movups %xmm0, 32(%rax) ; CHECK-NEXT: movq $0, 48(%rax)
; CHECK-NEXT: movups %xmm0, 16(%rax)
; CHECK-NEXT: movups %xmm0, (%rax)
; CHECK-NEXT: movl $0, 56(%rax)
; CHECK-NEXT: movq $0, 48(%rax)
} }
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind

View File

@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s ; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=I386 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
@.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
@.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
@ -11,8 +13,13 @@ entry:
bb: bb:
%String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0 %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1) call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
; CHECK: movabsq $2325069237881678925, %rax ; I386: calll {{_?}}memcpy
; CHECK: movups _.str3(%rip), %xmm0
; CORE2: movabsq
; CORE2: movabsq
; CORE2: movabsq
; COREI7: movups _.str3
br label %bb br label %bb
return: return:
@ -21,9 +28,9 @@ return:
declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
; CHECK: .section ; CORE2: .section
; CHECK: .align 4 ; CORE2: .align 4
; CHECK-NEXT: _.str1: ; CORE2-NEXT: _.str1:
; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" ; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
; CHECK: .align 4 ; CORE2: .align 4
; CHECK-NEXT: _.str3: ; CORE2-NEXT: _.str3: