diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0154afebb44..fbfd8686008 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5999,6 +5999,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
   { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
   { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
   { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+  { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
   { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
   { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
   { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
@@ -6014,6 +6015,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
   { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
   { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
   { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+  // TODO: Add the AVX versions of MOVLPSmr
   { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
   { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
   { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index da3c3222826..07dff9539cd 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -26,7 +26,7 @@ entry:
 }
 
 ; CHECK-LABEL: zero_test
-; CHECK: pxor %xmm0, %xmm0
+; CHECK: xorps %xmm0, %xmm0
 ; CHECK: ret
 
 define void @zero_test() {
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index f33fc8c69ef..a366102fbd7 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -6,7 +6,7 @@ entry:
 ; CHECK: pmovzxwd
   %A27 = load <4 x i16>, <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
-; CHECK: movlpd
+; CHECK: movq
   store <4 x i16> %A28, <4 x i16>* %in, align 4
   ret void
 ; CHECK: ret
@@ -18,7 +18,7 @@ define void @store_64(<2 x i32>* %ptr) {
 BB:
   store <2 x i32> zeroinitializer, <2 x i32>* %ptr
   ret void
-;CHECK: movlpd
+;CHECK: movlps
 ;CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/exedeps-movq.ll b/test/CodeGen/X86/exedeps-movq.ll
new file mode 100644
index 00000000000..b702c8716a9
--- /dev/null
+++ b/test/CodeGen/X86/exedeps-movq.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Verify that we select the correct version of the instruction that stores the low 64-bits
+; of a 128-bit vector. We want to avoid int/fp domain crossing penalties, so ignore the
+; bitcast ops and choose:
+;
+; movlps for floats
+; movlpd for doubles
+; movq for integers
+
+define void @store_floats(<4 x float> %x, i64* %p) {
+; SSE-LABEL: store_floats:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm0, %xmm0
+; SSE-NEXT: movlps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: store_floats:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %xmm0, %xmm0, %xmm0
+
+
+; !!! FIXME - the AVX version is not handled correctly.
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+
+
+; AVX-NEXT: retq
+  %a = fadd <4 x float> %x, %x
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x float> %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_double(<2 x double> %x, i64* %p) {
+; SSE-LABEL: store_double:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm0, %xmm0
+; SSE-NEXT: movlpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: store_double:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+  %a = fadd <2 x double> %x, %x
+  %b = extractelement <2 x double> %a, i32 0
+  %c = bitcast double %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_int(<4 x i32> %x, <2 x float>* %p) {
+; SSE-LABEL: store_int:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: store_int:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+; AVX-NEXT: retq
+  %a = add <4 x i32> %x, %x
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x i32> %b to <2 x float>
+  store <2 x float> %c, <2 x float>* %p
+  ret void
+}
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index cab62a38661..5afebd24bb4 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -581,7 +581,7 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
   ; CHECK: test_x86_sse2_storel_dq
   ; CHECK: movl
-  ; CHECK: movq
+  ; CHECK: movlps
   call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
   ret void
 }
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 26ac2a23e31..4018a21090e 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -9,7 +9,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
 ; CHECK-NEXT: shll $12, %ecx
 ; CHECK-NEXT: movd %ecx, %xmm0
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: movq %xmm0, (%eax)
 ; CHECK-NEXT: retl
   %tmp12 = shl i32 %a, 12
   %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index c77bffcbd37..cbd420885ac 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -7,7 +7,7 @@ define x86_mmx @t0(i32 %A) nounwind {
 ; X86-32: ## BB#0:
 ; X86-32: movd {{[0-9]+}}(%esp), %xmm0
 ; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X86-32-NEXT: movlpd %xmm0, (%esp)
+; X86-32-NEXT: movq %xmm0, (%esp)
 ; X86-32-NEXT: movq (%esp), %mm0
 ; X86-32-NEXT: addl $12, %esp
 ; X86-32-NEXT: retl
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index 66c64d32522..8ed8083a284 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -9,7 +9,7 @@
 define void @test1() {
 ;CHECK-LABEL: @test1
-;CHECK: xorpd
+;CHECK: xorps
   store <1 x i64> zeroinitializer, <1 x i64>* @M1
   store <2 x i32> zeroinitializer, <2 x i32>* @M2
   ret void
diff --git a/test/CodeGen/X86/vector-shuffle-mmx.ll b/test/CodeGen/X86/vector-shuffle-mmx.ll
index 094722d2680..dbccd2694b0 100644
--- a/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -9,7 +9,7 @@ define void @test0(<1 x i64>* %x) {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-NEXT: movlpd %xmm0, (%eax)
+; X32-NEXT: movq %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test0:
@@ -38,13 +38,13 @@ define void @test1() {
 ; X32-NEXT: .cfi_def_cfa_offset 24
 ; X32-NEXT: Ltmp2:
 ; X32-NEXT: .cfi_offset %edi, -8
-; X32-NEXT: xorpd %xmm0, %xmm0
-; X32-NEXT: movlpd %xmm0, (%esp)
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: movlps %xmm0, (%esp)
 ; X32-NEXT: movq (%esp), %mm0
 ; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT: movlpd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1
 ; X32-NEXT: xorl %edi, %edi
 ; X32-NEXT: maskmovq %mm1, %mm0
@@ -54,8 +54,8 @@ define void @test1() {
 ;
 ; X64-LABEL: test1:
 ; X64: ## BB#0: ## %entry
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
 ; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 6b7f489bf85..b0240ddb043 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -3,12 +3,14 @@
 
 ; CHECK: movl
 ; CHECK: paddw
-; CHECK: movlpd
+; CHECK: movq
+
+; FIXME - if this test cares about scheduling, why isn't it being checked?
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
 ; ATOM: paddw
-; ATOM: movlpd
+; ATOM: movq
 
 ; bitcast a v4i16 to v2i32
 
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 060dfb1011b..8ed2785ae73 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -52,7 +52,7 @@ forbody: ; preds = %forcond
 ; CHECK-NEXT: psraw $8
 ; CHECK-NEXT: psraw $2
 ; CHECK-NEXT: pshufb
-; CHECK-NEXT: movlpd
+; CHECK-NEXT: movq
 ;
 ; FIXME: We shouldn't require both a movd and an insert.
 ; CHECK-WIDE: %forbody
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index ccf0bd1d0b6..4e9d2dfdb5d 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: movl
-; CHECK: movlpd
+; CHECK: movq
 
 ; bitcast a i64 to v2i32
 define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 2aa870f16eb..302805213d0 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -84,7 +84,7 @@ define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
 ; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: movq %xmm0, (%eax)
 ; CHECK-NEXT: retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   store <8 x i8> %v, <8 x i8>* %p, align 8
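
A note on the TODO left in X86InstrInfo.cpp above ("Add the AVX versions of MOVLPSmr"): a minimal sketch of what that follow-up row might look like is shown below. The VEX opcode names here (X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr) are assumptions that mirror the SSE row added in this patch, not something taken from this commit, and would need to be checked against X86InstrSSE.td before use.

  static const uint16_t ReplaceableInstrs[][3] = {
    // ... existing AVX rows ...
    { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
    // Hypothetical AVX counterpart of the SSE MOVLPSmr row added above.
    { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
    { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
    // ... existing AVX rows ...
  };

If such a row is valid, it would presumably address the FIXME in exedeps-movq.ll, where the AVX float case still selects vmovq instead of vmovlps.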