llvm-6502/test/CodeGen/X86/insertps-O0-bug.ll

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s

; Check that at -O0, the backend doesn't attempt to canonicalize a vector load
; used by an INSERTPS into a scalar load plus scalar_to_vector.
;
; In order to fold a load into the memory operand of an INSERTPSrm, the backend
; tries to canonicalize a vector load that feeds an INSERTPS node into a
; scalar load plus scalar_to_vector. This allows ISel to match the
; INSERTPSrm variant rather than a load plus INSERTPSrr.
;
; However, ISel can only select an INSERTPSrm if folding a load into the operand
; of an insertps is considered to be profitable.
;
; In the example below:
;
; __m128 test(__m128 a, __m128 *b) {
;   __m128 c = _mm_insert_ps(a, *b, 1 << 6);
;   return c;
; }
;
; At -O0, the backend would attempt to canonicalize the load from 'b' into
; a scalar load in the hope of matching an INSERTPSrm.
; However, ISel would fail to recognize an INSERTPSrm since load folding is
; always considered unprofitable at -O0. ISel would instead select an
; INSERTPSrr, and the insertps mask would no longer be consistent with the
; narrowed scalar load, i.e. the mask would be left in an invalid state.
;
; The problem with the canonicalization rule performed by the backend is that
; it assumes that ISel can always match an INSERTPSrm. This assumption does
; not always hold at -O0. In this example, FastISel fails to lower the
; arguments needed by the entry block. This is enough to enable the DAGCombiner
; and eventually trigger the canonicalization on the INSERTPS node.
;
; This test checks that the vector load feeding the insertps is not
; canonicalized into a scalar load plus scalar_to_vector (a movss).

; Regression body for the -O0 INSERTPS canonicalization problem described in
; the file header. It loads the vector pointed to by %b and passes it, together
; with %a and the immediate 64 (1 << 6 in the C example above), to the SSE4.1
; insertps intrinsic. The FileCheck directives inside the function require a
; full-vector movaps from (%rdi), no movss between that load and the insertps,
; and an insertps with immediate $64 whose first register operand is the one
; written by the movaps.
define <4 x float> @test(<4 x float> %a, <4 x float>* %b) {
; CHECK-LABEL: test:
; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]]
; CHECK-NOT: movss
; CHECK: insertps $64, [[REG]],
; CHECK: ret
entry:
  ; 16-byte-aligned load of the whole vector; this is the load that must not be
  ; narrowed into a scalar load plus scalar_to_vector.
  %0 = load <4 x float>* %b, align 16
  ; The immediate 64 matches the `insertps $64` expected in the output.
  %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64)
  ; Round-trip the result through a stack slot before returning it.
  ; NOTE(review): presumably mirrors unoptimized front-end output for the C
  ; example in the header -- confirm before simplifying.
  %2 = alloca <4 x float>, align 16
  store <4 x float> %1, <4 x float>* %2, align 16
  %3 = load <4 x float>* %2, align 16
  ret <4 x float> %3
}


; Declaration of the SSE4.1 insertps intrinsic called by @test: it takes two
; <4 x float> operands plus an i32 control value (64 in this test) and returns
; a <4 x float>.
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)
[X86][DAG] Disable target specific combine on INSERTPS dag nodes at -O0. This patch disables target specific combine on X86ISD::INSERTPS dag nodes if optlevel is CodeGenOpt::None. The backend currently implements a target specific combine rule that converts a vector load used by an INSERTPS dag node into a scalar load plus a scalar_to_vector. This allows ISel to select a single INSERTPSrm instead of two instructions (i.e. a vector load plus INSERTPSrr). However, the existing target combine rule on INSERTPS nodes only works under the assumption that ISel will always be able to match an INSERTPSrm. This is not true in general at -O0, since the backend only allows folding a load into the memory operand of an instruction if the optimization level is not CodeGenOpt::None. In the example below: // __m128 test(__m128 a, __m128 *b) { __m128 c = _mm_insert_ps(a, *b, 1 << 6); return c; } // Before this patch, at -O0, the backend would have canonicalized the load to 'b' into a scalar load plus scalar_to_vector. Later on, ISel would have selected an INSERTPSrr leaving the insertps mask in an inconsistent state: movss 4(%rdi), %xmm1 insertps $64, %xmm1, %xmm0 # xmm0 = xmm1[1],xmm0[1,2,3]. With this patch, the backend avoids folding the vector load into the operand of the INSERTPS. The new codegen at -O0 is: movaps (%rdi), %xmm1 insertps $64, %xmm1, %xmm0 # %xmm1[1],xmm0[1,2,3]. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226277 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-16 14:55:26 +00:00			`; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s \| FileCheck %s`

			`; Check that at -O0, the backend doesn't attempt to canonicalize a vector load`
			`; used by an INSERTPS into a scalar load plus scalar_to_vector.`
			`;`
			`; In order to fold a load into the memory operand of an INSERTPSrm, the backend`
			`; tries to canonicalize a vector load in input to an INSERTPS node into a`
			`; scalar load plus scalar_to_vector. This would allow ISel to match the`
			`; INSERTPSrm variant rather than a load plus INSERTPSrr.`
			`;`
			`; However, ISel can only select an INSERTPSrm if folding a load into the operand`
			`; of an insertps is considered to be profitable.`
			`;`
			`; In the example below:`
			`;`
			`; __m128 test(__m128 a, __m128 *b) {`
			`; __m128 c = _mm_insert_ps(a, *b, 1 << 6);`
			`; return c;`
			`; }`
			`;`
			`; At -O0, the backend would attempt to canonicalize the load to 'b' into`
			`; a scalar load in the hope of matching an INSERTPSrm.`
			`; However, ISel would fail to recognize an INSERTPSrm since load folding is`
			`; always considered unprofitable at -O0. This would leave the insertps mask`
			`; in an invalid state.`
			`;`
			`; The problem with the canonicalization rule performed by the backend is that`
			`; it assumes ISel to always be able to match an INSERTPSrm. This assumption is`
			`; not always correct at -O0. In this example, FastISel fails to lower the`
			`; arguments needed by the entry block. This is enough to enable the DAGCombiner`
			`; and eventually trigger the canonicalization on the INSERTPS node.`
			`;`
			`; This test checks that the vector load in input to the insertps is not`
			`; canonicalized into a scalar load plus scalar_to_vector (a movss).`

			`define <4 x float> @test(<4 x float> %a, <4 x float>* %b) {`
			`; CHECK-LABEL: test:`
			`; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]]`
			`; CHECK-NOT: movss`
			`; CHECK: insertps $64, [[REG]],`
			`; CHECK: ret`
			`entry:`
			`%0 = load <4 x float>* %b, align 16`
			`%1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64)`
			`%2 = alloca <4 x float>, align 16`
			`store <4 x float> %1, <4 x float>* %2, align 16`
			`%3 = load <4 x float>* %2, align 16`
			`ret <4 x float> %3`
			`}`


			`declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)`