From 807360ab087a0d2d9ef056f4d737fc93a3feaaf0 Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Fri, 23 Jan 2015 22:44:16 +0000 Subject: [PATCH] [x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector Handle the poor codegen for i64/x86mmx->v2i64 (%mm -> %xmm) moves. Instead of using stack store/load pair to do the job, use scalar_to_vector directly, which in the MMX case can use movq2dq. This was the behavior prior to improvements for vector legalization of extloads in r213897. This commit fixes the regression and as a side-effect also removes some unnecessary shuffles. In the new attached testcase, we go from: pshufw $-18, (%rdi), %mm0 movq %mm0, -8(%rsp) movq -8(%rsp), %xmm0 pshufd $-44, %xmm0, %xmm0 movd %xmm0, %eax ... To: pshufw $-18, (%rdi), %mm0 movq2dq %mm0, %xmm0 movd %xmm0, %eax ... Differential Revision: http://reviews.llvm.org/D7126 rdar://problem/19413324 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226953 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 29 +++++++++++++++++++++++++ test/CodeGen/X86/2012-01-18-vbitcast.ll | 11 +++++----- test/CodeGen/X86/lower-bitcast.ll | 5 +++-- test/CodeGen/X86/mmx-movq2dq.ll | 29 +++++++++++++++++++++++++ test/CodeGen/X86/widen_load-2.ll | 3 +-- 5 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 test/CodeGen/X86/mmx-movq2dq.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e92a099753b..37026ce0f12 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -24757,6 +24757,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); + SDValue Ptr = Ld->getBasePtr(); + SDValue Chain = Ld->getChain(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -24795,6 +24797,33 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 
return DCI.CombineTo(N, NewVec, TF, true); } + // Conversion from x86mmx/i64 to v2i64 types is often done via stack + // store/load. Under certain conditions we can bypass the memory access and + // combine this load to use a scalar_to_vector instead. This leads to + // a reduction in the stack use, redundant emission of shuffles and create + // isel matching candidates for movq2dq instructions. + if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD && + !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) { + + // If this load is directly stored, get the original source value. + StoreSDNode *PrevST = cast(Chain); + EVT SrcTy = PrevST->getValue().getValueType(); + if (PrevST->getBasePtr() != Ptr || + !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx)) + return SDValue(); + SDValue SrcVal = Chain.getOperand(1); + + // On 32bit systems, we can't save 64bit integers, use f64 instead. + bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit(); + if (Usef64) + SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal); + SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT, + SrcVal); + + return DCI.CombineTo(N, Usef64 ? 
+ DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain); + } + return SDValue(); } diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll index 9eb59e41ef7..efba66be97e 100644 --- a/test/CodeGen/X86/2012-01-18-vbitcast.ll +++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -1,14 +1,15 @@ ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s -;CHECK-LABEL: vcast: +; CHECK-LABEL: vcast: define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { -;CHECK: pmovzxdq -;CHECK: pmovzxdq +; CHECK-NOT: pmovzxdq +; CHECK-NOT: pmovzxdq +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> +; CHECK-NEXT: psubq (%{{.*}}), %[[R0]] %x = sub <2 x i32> %af, %bf -;CHECK: psubq +; CHECK: ret ret <2 x i32> %x -;CHECK: ret } diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index edb8433ec30..5fad82497b9 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -68,12 +68,13 @@ define i64 @test4(i64 %A) { %2 = bitcast <2 x i32> %add to i64 ret i64 %2 } -; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd. +; FIXME: At the moment we still produce the sequence paddd+pshufd. ; Ideally, we should fold that sequence into a single paddd. This is fixed with ; the widening legalization. 
; ; CHECK-LABEL: test4 -; CHECK: pshufd +; CHECK: movd +; CHECK-NOT: pshufd ; CHECK-NEXT: paddd ; CHECK-NEXT: pshufd ; CHECK: ret diff --git a/test/CodeGen/X86/mmx-movq2dq.ll b/test/CodeGen/X86/mmx-movq2dq.ll new file mode 100644 index 00000000000..9f46da53bd7 --- /dev/null +++ b/test/CodeGen/X86/mmx-movq2dq.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32 +; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64 + +; X86-32-LABEL: test0 +; X86-64-LABEL: test0 +define i32 @test0(<1 x i64>* %v4) { + %v5 = load <1 x i64>* %v4, align 8 + %v12 = bitcast <1 x i64> %v5 to <4 x i16> + %v13 = bitcast <4 x i16> %v12 to x86_mmx + ; X86-32: pshufw $238 + ; X86-32-NOT: movq + ; X86-32-NOT: movsd + ; X86-32: movq2dq + ; X86-64: pshufw $238 + ; X86-64-NOT: movq + ; X86-64-NOT: pshufd + ; X86-64: movq2dq + ; X86-64-NEXT: movd + %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18) + %v15 = bitcast x86_mmx %v14 to <4 x i16> + %v16 = bitcast <4 x i16> %v15 to <1 x i64> + %v17 = extractelement <1 x i64> %v16, i32 0 + %v18 = bitcast i64 %v17 to <2 x i32> + %v19 = extractelement <2 x i32> %v18, i32 0 + %v20 = add i32 %v19, 32 + ret i32 %v20 +} + +declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index c6bd96421d7..0d5380eb3ac 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -78,8 +78,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp ; CHECK-NEXT: paddd %[[R0]], %[[R1]] ; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}}) ; CHECK-NEXT: pshufb {{.*}}, %[[R1]] -; CHECK-NEXT: pmovzxdq %[[R1]], %[[R0]] -; CHECK-NEXT: movd %[[R0]], (%{{.*}}) +; CHECK-NEXT: movd %[[R1]], (%{{.*}}) %a = load %i16vec3* %ap, align 16 %b = load %i16vec3* %bp, align 16 %x = add %i16vec3 %a, %b