diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 49f54ebe9a5..cb208bdd4e3 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -693,6 +693,7 @@ def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
 def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
 def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
 def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
 def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
 def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
 def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index bd00bdd02b5..a2d97456405 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -612,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
 
 // Patterns
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 15 in {
-  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
-  // MOVS{S,D} to the lower bits.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4f32 (V_SET0)),
-                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4i32 (V_SET0)),
-                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
-  }
-
   let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -670,31 +647,10 @@ let Predicates = [UseAVX] in {
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
-             sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0),
-             (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
-             sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2f64 (V_SET0)),
-                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2i64 (V_SET0)),
-                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
-
   // Extract and store.
   def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                    addr:$dst),
@@ -745,7 +701,6 @@ let Predicates = [UseAVX] in {
              (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>;
 
-
   // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
   // is during lowering, where it's not possible to recognize the fold cause
   // it has two uses through a bitcast. One use disappears at isel time and the
   // fold opportunity reappears.
@@ -761,7 +716,7 @@
 }
 
 let Predicates = [UseSSE1] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
@@ -795,7 +750,7 @@
 }
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
@@ -7576,6 +7531,57 @@ let Predicates = [HasAVX2] in {
             (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 }
 
+// Patterns
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  }
+
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0),
+             (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+             sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0),
+             (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+             sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+let Predicates = [UseSSE41] in {
+  // With SSE41 we can use blends for these patterns.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (BLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
+}
+
+
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index 42a50b65905..c1f6c79e81a 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -228,9 +228,9 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
 ; CHECK-NEXT:    xorps %xmm3, %xmm3
-; CHECK-NEXT:    movss %xmm0, %xmm3
+; CHECK-NEXT:    blendps $1, %xmm0, %xmm3
 ; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,0]
-; CHECK-NEXT:    movss %xmm1, %xmm2
+; CHECK-NEXT:    blendps $1, %xmm1, %xmm2
 ; CHECK-NEXT:    orps %xmm3, %xmm2
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 3be2520ac00..c8e509ce69e 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -522,7 +522,7 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X00A:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    movss %xmm0, %xmm2
+; X32-NEXT:    blendps $1, %xmm0, %xmm2
 ; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X32-NEXT:    movaps %xmm2, %xmm0
 ; X32-NEXT:    retl
@@ -530,7 +530,7 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X00A:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss %xmm0, %xmm2
+; X64-NEXT:    blendps $1, %xmm0, %xmm2
 ; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X64-NEXT:    movaps %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -546,7 +546,7 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X00X:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    movss %xmm0, %xmm1
+; X32-NEXT:    blendps $1, %xmm0, %xmm1
 ; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -554,7 +554,7 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X00X:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    movss %xmm0, %xmm1
+; X64-NEXT:    blendps $1, %xmm0, %xmm1
 ; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X64-NEXT:    movaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -570,7 +570,7 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X0YC:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    movss %xmm0, %xmm2
+; X32-NEXT:    blendps $1, %xmm0, %xmm2
 ; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X32-NEXT:    movaps %xmm2, %xmm0
@@ -579,7 +579,7 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X0YC:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss %xmm0, %xmm2
+; X64-NEXT:    blendps $1, %xmm0, %xmm2
 ; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X64-NEXT:    movaps %xmm2, %xmm0
@@ -692,7 +692,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00A:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    movss %xmm0, %xmm2
+; X32-NEXT:    blendps $1, %xmm0, %xmm2
 ; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X32-NEXT:    movaps %xmm2, %xmm0
 ; X32-NEXT:    retl
@@ -700,7 +700,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X00A:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss %xmm0, %xmm2
+; X64-NEXT:    blendps $1, %xmm0, %xmm2
 ; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X64-NEXT:    movaps %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -716,7 +716,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00X:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    movss %xmm0, %xmm1
+; X32-NEXT:    blendps $1, %xmm0, %xmm1
 ; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -724,7 +724,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X00X:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    movss %xmm0, %xmm1
+; X64-NEXT:    blendps $1, %xmm0, %xmm1
 ; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X64-NEXT:    movaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -740,7 +740,7 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X0YC:
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    movss %xmm0, %xmm2
+; X32-NEXT:    blendps $1, %xmm0, %xmm2
 ; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X32-NEXT:    movaps %xmm2, %xmm0
@@ -749,7 +749,7 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X0YC:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss %xmm0, %xmm2
+; X64-NEXT:    blendps $1, %xmm0, %xmm2
 ; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X64-NEXT:    movaps %xmm2, %xmm0
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index 043cf96a671..b38b8bfb81f 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -39,7 +39,7 @@ entry:
 define <4 x float> @test3(<4 x float> %A) {
 ; CHECK-LABEL: test3:
 ; CHECK: xorps %[[X1:xmm[0-9]+]], %[[X1]]
-; CHECK-NEXT: movss %xmm0, %[[X1]]
+; CHECK-NEXT: blendps $1, %xmm0, %[[X1]]
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1]
 ; CHECK-NEXT: retl
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index f6ba5db85f3..019988b3762 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -438,17 +438,38 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
-; SSE-LABEL: shuffle_v4f32_4zzz:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4f32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_4zzz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -639,34 +660,76 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_4zzz:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_4zzz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
 }
 
 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_z4zz:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_z4zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_z4zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z4zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_z4zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_z4zz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
@@ -674,17 +737,38 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_zz4z:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_zz4z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zz4z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zz4z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zz4z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zz4z:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
@@ -692,17 +776,38 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_zuu4:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_zuu4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zuu4:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zuu4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zuu4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zuu4:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
@@ -1031,12 +1136,33 @@ define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
 }
 
 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
-; SSE-LABEL: insert_reg_and_zero_v4f32:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_reg_and_zero_v4f32:
 ; AVX:       # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 595447775b5..32ee62fa985 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -678,8 +678,8 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
@@ -697,8 +697,8 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq (%rdi), %xmm0
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
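
For reviewers who want to try the change locally, here is a minimal IR sketch of the X86vzmovl pattern this patch retargets (the function name and IR below are illustrative only, not taken from the patch's test suite):

```llvm
; Illustrative sketch: keep lane 0 of %v and zero lanes 1-3, which isel
; matches as X86vzmovl ("move low element, zero the rest").
define <4 x float> @vzmovl_example(<4 x float> %v) {
  ; Lane 0 selects %v[0]; lanes 1-3 select elements 5-7, i.e. the zero vector.
  %r = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %r
}
```

On an SSE4.1 target this now selects `xorps` + `blendps $1` instead of `xorps` + `movss`, as the updated FileCheck lines above show, and the AVX 256-bit cases go from `vmovss`/`vmovsd` plus subregister insertion to a single `vblendps`/`vblendpd`. The `scalar_to_vector` forms keep the MOVSS/MOVSD patterns, presumably because those merge an FR32/FR64 operand that a VR128 blend cannot take directly.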