diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 450d29a8574..8cd00a9aa0e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7723,6 +7723,20 @@ let Predicates = [HasAVX2] in {
             (VPBROADCASTQrm addr:$src)>;
   def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
             (VPBROADCASTQYrm addr:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSrr
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSYrr
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VBROADCASTSDrr
+              (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>;
+  }
 }
 
 // AVX1 broadcast patterns
@@ -7735,11 +7749,41 @@ def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
             (VBROADCASTSSYrm addr:$src)>;
   def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
             (VBROADCASTSDrm addr:$src)>;
-
   def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
             (VBROADCASTSSrm addr:$src)>;
   def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
             (VBROADCASTSSrm addr:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  // VPSHUFD shuffles 32-bit lanes: imm 0 replicates dword 0 (an f32 splat),
+  // imm 0x44 replicates dwords {0,1} (a splat of the low 64 bits, i.e. f64).
+  // 128bit broadcasts:
+  def : Pat<(v2f64 (X86VBroadcast FR64:$src)),
+            (VPSHUFDri
+              (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), 0x44)>;
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VPSHUFDri
+              (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>;
+  // 256bit broadcasts: splat within 128 bits, then place it in both halves.
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+              (VPSHUFDri
+                (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0),
+              sub_xmm),
+              (VPSHUFDri
+                (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss),
+                0), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (VPSHUFDri
+                (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+                0x44), sub_xmm),
+              (VPSHUFDri
+                (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+                0x44), 1)>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414761c..6eba694bd4a 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
   ret <8 x i32> %g
 }
 
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+  %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+  ret <8 x float> %g
+}
+
 ; CHECK: _e2
 ; CHECK: vbroadcastss
 ; CHECK: ret
@@ -179,9 +188,37 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
   %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
   %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
   %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
-  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
-  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
-  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
-  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
   ret <8 x i8> %vecinit7.i
 }
+
+
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+  br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+  %A.0.inVF = fmul float undef, 6.553600e+04
+  %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+  %A.0VF = fptosi float %A.0.inVF to i32
+  %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+  %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %1 = and i32 %A.0VF, 65535
+  %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+  %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+  br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+  br label %footer349VF
+
+footer349VF:
+  %2 = mul nsw <8 x i32> undef, %0
+  %3 = mul nsw <8 x i32> undef, %vector1099VF
+  br label %footer329VF
+
+ret:
+  ret void
+}