From c16105b078a09cbf377c637a5e7e6573ddb58593 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sat, 20 Sep 2014 04:15:22 +0000
Subject: [PATCH] [x86] Teach the v4f32 path of the new shuffle lowering to
 handle the tricky case of single-element insertion into the zero lane of a
 zero vector.

We can't just use the same pattern here as we do in every other vector
type because the general insertion logic can handle insertion into the
non-zero lane of the vector. However, in SSE4.1 with v4f32 vectors we
have INSERTPS that is a much better choice than the generic one for such
lowerings. But INSERTPS can do lots of other lowerings as well so
factoring its logic into the general insertion logic doesn't work very
well. We also can't just extract the core common part of the general
insertion logic that is faster (forming VZEXT_MOVL synthetic nodes that
lower to MOVSS when they can) because VZEXT_MOVL is often *faster* than
a blend while INSERTPS is slower! So instead we do a restrictive
condition on attempting to use the generic insertion logic to narrow it
to those cases where VZEXT_MOVL won't need a shuffle afterward and thus
will do better than INSERTPS. Then we try blending. Then we go back to
INSERTPS.

This still doesn't generate perfect code for some silly reasons that can
be fixed by tweaking the td files for lowering VZEXT_MOVL to use
XORPS+BLENDPS when available rather than XORPS+MOVSS when the input ends
up in a register rather than a load from memory -- BLENDPSrr has twice
the reciprocal throughput of MOVSSrr. Don't you love this ISA?

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218177 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 10 ++++
 test/CodeGen/X86/vector-shuffle-128-v4.ll | 71 +++++++++++++++++++++++
 2 files changed, 81 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7ca44b4615a..6ffda166ed2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7784,6 +7784,16 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
 
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend =
             lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 1dbc7f5e1da..3645c9475a2 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -806,3 +806,74 @@ define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
   ret <4 x i32> %shuffle
 }
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; ALL-LABEL: @insert_reg_and_zero_v4i32
+; ALL: # BB#0:
+; ALL-NEXT: movd %edi, %xmm0
+; ALL-NEXT: retq
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4i32
+; ALL: # BB#0:
+; ALL-NEXT: movd (%rdi), %xmm0
+; ALL-NEXT: retq
+  %a = load i32* %ptr
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: @insert_reg_and_zero_v4f32
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: movss %xmm0, %[[X]]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT: movss %xmm0, %[[X]]
+; SSE3-NEXT: movaps %[[X]], %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT: movss %xmm0, %[[X]]
+; SSSE3-NEXT: movaps %[[X]], %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @insert_reg_and_zero_v4f32
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: movss %xmm0, %[[X]]
+; SSE41-NEXT: movaps %[[X]], %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @insert_reg_and_zero_v4f32
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %[[X:xmm[0-9]+]], %[[X]], %[[X]]
+; AVX1-NEXT: vmovss %xmm0, %[[X]], %xmm0
+; AVX1-NEXT: retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4f32
+; ALL: # BB#0:
+; ALL-NEXT: movss (%rdi), %xmm0
+; ALL-NEXT: retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32>
+  ret <4 x float> %shuffle
+}