diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 959ae36fc59..1276bda524a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1019,6 +1019,46 @@ let Predicates = [HasAVX] in {
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v32i8 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                            (v4f64 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                            (v8f32 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                            (v4i64 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                            (v8i32 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                            (v16i16 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                            (v32i8 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                    (v4f64 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                    (v8f32 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                    (v4i64 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                    (v8i32 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                    (v16i16 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                    (v32i8 VR256:$src), (i32 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
 }

 // Use movaps / movups for SSE integer load / store (one byte shorter).
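These patterns key off the target-independent extract_subvector node rather than the AVX intrinsics, so they should also fire for plain IR that takes the low half with a shufflevector. A minimal sketch of such an input (illustrative only, not part of the patch; the function name is made up, and it assumes the contiguous low-half shuffle is recognized as a subvector extract during selection):

; Hypothetical example, not from this patch: the low 128 bits of %a are
; extracted with a generic shufflevector (indices 0..3) and stored, so
; isel should now emit vmovaps %xmm0, (%rdi) instead of vextractf128.
define void @low_half_store(float* nocapture %addr, <8 x float> %a) nounwind {
entry:
  %lo = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ptr = bitcast float* %addr to <4 x float>*
  store <4 x float> %lo, <4 x float>* %ptr, align 16
  ret void
}

The aligned group needs separate alignedstore patterns because the unaligned group must select the VMOVUP* forms, which tolerate any address.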
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index a900eeb0609..ff56a454996 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
 }

 ; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
   %1 = bitcast float* %addr to <4 x float>*
   store <4 x float> %0, <4 x float>* %1, align 16
   ret void
@@ -33,12 +33,12 @@ entry:
 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

 ; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
   %1 = bitcast double* %addr to <2 x double>*
   store <2 x double> %0, <2 x double>* %1, align 16
   ret void
@@ -47,10 +47,43 @@ entry:
 declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

 ; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+  %0 = bitcast <4 x i64> %a to <8 x i32>
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 16
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
+
+; CHECK: @t5
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %1 = bitcast float* %addr to <4 x float>*
+  store <4 x float> %0, <4 x float>* %1, align 16
+  ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %1 = bitcast double* %addr to <2 x double>*
+  store <2 x double> %0, <2 x double>* %1, align 16
+  ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
   %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
@@ -59,4 +92,13 @@ entry:
   ret void
 }

-declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+  %0 = bitcast <4 x i64> %a to <8 x i32>
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 1
+  ret void
+}