diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9b134d7a690..2697a543df6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -643,9 +643,6 @@ let Predicates = [UseAVX] in {
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
@@ -653,9 +650,6 @@ let Predicates = [UseAVX] in {
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
   // Extract and store.
   def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
@@ -793,7 +787,7 @@ let Predicates = [UseSSE2] in {
             (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
 
   // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
-  // is during lowering, where it's not possible to recognize the fold cause
+  // is during lowering, where it's not possible to recognize the fold because
   // it has two uses through a bitcast. One use disappears at isel time and the
   // fold opportunity reappears.
   def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
@@ -4907,7 +4901,8 @@ let Predicates = [UseAVX] in {
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
             (VMOVDI2PDIrr GR32:$src)>;
 
-  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+  // These instructions also write zeros in the high part of a 256-bit register.
   let AddedComplexity = 20 in {
     def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
               (VMOVDI2PDIrm addr:$src)>;
@@ -4915,6 +4910,9 @@ let Predicates = [UseAVX] in {
               (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
               (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                     (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
@@ -5033,6 +5031,9 @@ let Predicates = [UseAVX], AddedComplexity = 20 in {
             (VMOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
             (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
 }
 
 let Predicates = [UseSSE2], AddedComplexity = 20 in {
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index ece28bfb6c5..7c625c226cf 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -17,7 +17,7 @@ entry:
 ; CHECK-LABEL: bad_insert:
 define void @bad_insert(i32 %t) {
 entry:
-; CHECK: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovaps %ymm0
 ; CHECK: ret
 
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 2c9af163f1e..e4aedf18858 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -827,12 +827,12 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    retq
   %a = load i64, i64* %ptr
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index a318e8a17c3..6feffb83609 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2090,3 +2090,20 @@ entry:
   %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32>
   ret <8 x float> %res
 }
+
+define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v8i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v8i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    retq
+  %a = load i32, i32* %ptr
+  %v = insertelement <8 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+