diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 53d5ee30224..9f5f66e37c8 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -539,6 +539,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VSQRTPSr_Int,    X86::VSQRTPSm_Int,        TB_ALIGN_16 },
     { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
     { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+
     // AVX 256-bit foldable instructions
     { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
     { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
@@ -547,6 +549,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
     { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        TB_ALIGN_32 },
     { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        TB_ALIGN_32 },
+
     // AVX2 foldable instructions
     { X86::VPABSBrr256,     X86::VPABSBrm256,         TB_ALIGN_32 },
     { X86::VPABSDrr256,     X86::VPABSDrm256,         TB_ALIGN_32 },
@@ -562,6 +565,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VSQRTPDYr_Int,   X86::VSQRTPDYm_Int,       TB_ALIGN_32 },
     { X86::VSQRTPSYr,       X86::VSQRTPSYm,           TB_ALIGN_32 },
     { X86::VSQRTPSYr_Int,   X86::VSQRTPSYm_Int,       TB_ALIGN_32 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
   };

   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 5a224aff9a1..e4caace00ca 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7274,7 +7274,7 @@ let ExeDomain = SSEPackedSingle in {
                                     int_x86_avx_vbroadcast_ss_256>;
 }
 let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
+def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
                                    int_x86_avx_vbroadcast_sd_256>;
 def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                    int_x86_avx_vbroadcastf128_pd_256>;
@@ -7753,11 +7753,11 @@ let Predicates = [HasAVX] in {
   def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
             (VBROADCASTSSYrm addr:$src)>;
   def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
-            (VBROADCASTSDrm addr:$src)>;
+            (VBROADCASTSDYrm addr:$src)>;
   def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
             (VBROADCASTSSYrm addr:$src)>;
   def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
-            (VBROADCASTSDrm addr:$src)>;
+            (VBROADCASTSDYrm addr:$src)>;
   def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
             (VBROADCASTSSrm addr:$src)>;
   def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
new file mode 100644
index 00000000000..6a7bb63b88e
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+
+declare x86_fastcallcc i64 @barrier()
+
+;CHECK: bcast_fold
+;CHECK: vbroadcastss -24(%ebp), %ymm0 # 16-byte Folded Reload
+;CHECK: ret
+define <8 x float> @bcast_fold( float* %A) {
+BB:
+  %A0 = load float* %A
+  %tt3 = call x86_fastcallcc i64 @barrier()
+  br i1 undef, label %work, label %exit
+
+work:
+  %A1 = insertelement <8 x float> undef, float %A0, i32 0
+  %A2 = shufflevector <8 x float> %A1, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %A2
+
+exit:
+  ret <8 x float> undef
+}