diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index caaf54490c3..6deee4f68ef 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -520,6 +520,8 @@ let Predicates = [HasSSE2] in {
   // is during lowering, where it's not possible to recognize the fold cause
   // it has two uses through a bitcast. One use disappears at isel time and the
   // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
   def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
             (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
   def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
@@ -647,6 +649,9 @@ let Predicates = [HasAVX] in {
   // is during lowering, where it's not possible to recognize the fold cause
   // it has two uses through a bitcast. One use disappears at isel time and the
   // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),
+                                                   sub_sd))>;
   def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
             (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
                                                    sub_sd))>;
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index f9c00eb0c89..64508b529d6 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -49,3 +49,22 @@ entry:
   store double %2, double* %0
   ret void
 }
+
+; rdar://10436044
+; The shuffle mask <2, 1> takes lane 0 from %tmp6 and lane 1 from %tmp5.
+define <2 x double> @t3() nounwind readonly {
+bb:
+; CHECK: t3:
+; CHECK: punpcklqdq %xmm1, %xmm0
+; CHECK: movq (%rax), %xmm1
+; CHECK: movsd %xmm1, %xmm0
+  %tmp0 = load i128* null, align 1
+  %tmp1 = load <2 x i32>* undef, align 8
+  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
+  %tmp3 = bitcast <2 x i32> %tmp1 to i64
+  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
+  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
+  %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
+  %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
+  ret <2 x double> %tmp7
+}