From ac7b9c828fba1c7676102d4aac43ec1b1ce97c25 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Fri, 16 Jan 2015 14:55:26 +0000 Subject: [PATCH] [X86][DAG] Disable target specific combine on INSERTPS dag nodes at -O0. This patch disables the target specific combine on X86ISD::INSERTPS dag nodes if the optimization level is CodeGenOpt::None. The backend currently implements a target specific combine rule that converts a vector load used by an INSERTPS dag node into a scalar load plus a scalar_to_vector. This allows ISel to select a single INSERTPSrm instead of two instructions (i.e. a vector load plus INSERTPSrr). However, the existing target combine rule on INSERTPS nodes only works under the assumption that ISel will always be able to match an INSERTPSrm. This is not true in general at -O0, since the backend only allows folding a load into the memory operand of an instruction if the optimization level is not CodeGenOpt::None. In the example below: // __m128 test(__m128 a, __m128 *b) { __m128 c = _mm_insert_ps(a, *b, 1 << 6); return c; } // Before this patch, at -O0, the backend would have canonicalized the load from 'b' into a scalar load plus scalar_to_vector. Later on, ISel would have selected an INSERTPSrr, leaving the insertps mask in an inconsistent state (the scalar load places the requested element in lane 0 of xmm1, while the immediate still selects lane 1): movss 4(%rdi), %xmm1 insertps $64, %xmm1, %xmm0 # xmm0 = xmm1[1],xmm0[1,2,3]. With this patch, the backend avoids folding the vector load into the operand of the INSERTPS. The new codegen at -O0 is: movaps (%rdi), %xmm1 insertps $64, %xmm1, %xmm0 # xmm0 = xmm1[1],xmm0[1,2,3]. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226277 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 7 ++-- test/CodeGen/X86/insertps-O0-bug.ll | 52 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/insertps-O0-bug.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a1fd34ea800..40609a6df37 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25874,8 +25874,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); - case X86ISD::INSERTPS: - return PerformINSERTPSCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) + return PerformINSERTPSCombine(N, DAG, Subtarget); + break; + } case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } diff --git a/test/CodeGen/X86/insertps-O0-bug.ll b/test/CodeGen/X86/insertps-O0-bug.ll new file mode 100644 index 00000000000..e89ac26ea07 --- /dev/null +++ b/test/CodeGen/X86/insertps-O0-bug.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s + +; Check that at -O0, the backend doesn't attempt to canonicalize a vector load +; used by an INSERTPS into a scalar load plus scalar_to_vector. +; +; In order to fold a load into the memory operand of an INSERTPSrm, the backend +; tries to canonicalize a vector load in input to an INSERTPS node into a +; scalar load plus scalar_to_vector. This would allow ISel to match the +; INSERTPSrm variant rather than a load plus INSERTPSrr. +; +; However, ISel can only select an INSERTPSrm if folding a load into the operand +; of an insertps is considered to be profitable. 
+; +; In the example below: +; +; __m128 test(__m128 a, __m128 *b) { +; __m128 c = _mm_insert_ps(a, *b, 1 << 6); +; return c; +; } +; +; At -O0, the backend would attempt to canonicalize the load to 'b' into +; a scalar load in the hope of matching an INSERTPSrm. +; However, ISel would fail to recognize an INSERTPSrm since load folding is +; always considered unprofitable at -O0. This would leave the insertps mask +; in an invalid state. +; +; The problem with the canonicalization rule performed by the backend is that +; it assumes ISel to always be able to match an INSERTPSrm. This assumption is +; not always correct at -O0. In this example, FastISel fails to lower the +; arguments needed by the entry block. This is enough to enable the DAGCombiner +; and eventually trigger the canonicalization on the INSERTPS node. +; +; This test checks that the vector load in input to the insertps is not +; canonicalized into a scalar load plus scalar_to_vector (a movss). + +define <4 x float> @test(<4 x float> %a, <4 x float>* %b) { +; CHECK-LABEL: test: +; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]] +; CHECK-NOT: movss +; CHECK: insertps $64, [[REG]], +; CHECK: ret +entry: + %0 = load <4 x float>* %b, align 16 + %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64) + %2 = alloca <4 x float>, align 16 + store <4 x float> %1, <4 x float>* %2, align 16 + %3 = load <4 x float>* %2, align 16 + ret <4 x float> %3 +} + + +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)