Mirror of https://github.com/c64scene-ar/llvm-6502.git (synced 2024-12-15 04:30:12 +00:00)

Commit b4c1547749:

Now that we've replaced the vinsertf128 intrinsics, do the same for their extract twins.

This is very much like D8086 (checked in at r231794): we want to replace as much custom x86 shuffling via intrinsics as possible, because pushing the code down the generic shuffle optimization path allows for better codegen and less complexity in LLVM. This is also the LLVM sibling to the cfe D8275 patch.

Differential Revision: http://reviews.llvm.org/D8276

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232045 91177308-0d34-0410-b5e6-96231b3b80d8

; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
  ret <8 x i32> %res
}

; Verify that high bits of the immediate are masked off. This should be the
; equivalent of a vinsertf128 $0, which should be optimized into a blend, so
; just check that it's not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
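
; The commit above maps vinsertf128 with a nonzero immediate onto generic
; shuffles. A minimal sketch of that equivalence, assuming the usual
; widen-then-concatenate shuffle pattern (the function name is illustrative
; and not part of this test):
define <4 x double> @insertf128_as_shuffle_sketch(<4 x double> %a0, <2 x double> %a1) {
  ; Widen the 128-bit operand to 256 bits, then keep the low half of %a0
  ; and use the widened %a1 as the high half.
  %wide = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}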

; We don't check any vextractf128 variant with immediate 0 because that's just a move.

define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
; CHECK: vextractf128 $1, %ymm0, %xmm0
  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
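
; For the extract twin, a nonzero immediate selects the high 128-bit lane,
; which in generic IR is just a two-element shuffle. A minimal sketch
; (illustrative function name, not part of this test):
define <2 x double> @extractf128_as_shuffle_sketch(<4 x double> %a0) {
  ; Elements 2 and 3 form the upper 128 bits of a <4 x double>.
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}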

; Verify that high bits of the immediate are masked off. This should be the
; equivalent of a vextractf128 $0, which should be optimized away, so just
; check that it's not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
; CHECK-NOT: vextractf128
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK: vblendpd
  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone

define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
; CHECK: vblendps
  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone

define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
; CHECK: vdpps
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone

define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone

define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
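
; The immediate byte shifts can likewise be expressed as shuffles over bytes,
; with zeros shifted in. As the CHECK pattern above shows, the i32 8 operand
; here amounts to a one-byte shift. A minimal sketch of that one-byte psrldq
; as a generic shuffle (illustrative function name; pslldq is analogous with
; the zero byte on the other end):
define <2 x i64> @psrldq_as_shuffle_sketch(<2 x i64> %a0) {
  %bytes = bitcast <2 x i64> %a0 to <16 x i8>
  ; Bytes 1..15 move down one slot; index 16 picks a zero from the second operand.
  %shifted = shufflevector <16 x i8> %bytes, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %res = bitcast <16 x i8> %shifted to <2 x i64>
  ret <2 x i64> %res
}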

define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: vblendpd
  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
; CHECK: vblendps
  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
; CHECK: vpblendw
  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
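
; Blends with a constant immediate are also plain shuffles: each set mask bit
; picks that lane from the second source. A minimal sketch for blendps with
; immediate 7 = 0b0111 (illustrative function name, not part of this test):
define <4 x float> @blendps_as_shuffle_sketch(<4 x float> %a0, <4 x float> %a1) {
  ; Lanes 0-2 come from %a1 (concatenated indices 4-6); lane 3 stays from %a0.
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
  ret <4 x float> %res
}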