mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-27 13:30:05 +00:00
2360e51fd0
This patch teaches the backend how to combine packed SSE2/AVX2 arithmetic shift intrinsics. The rules are: - Always fold a packed arithmetic shift by zero to its first operand; - Convert a packed arithmetic shift intrinsic dag node into an ISD::SRA only if the shift count is known to be smaller than the vector element size. This patch also teaches the function 'getTargetVShiftByConstNode' how to fold target-specific vector shifts by zero. Added two new tests to verify that the DAGCombiner is able to fold sequences of SSE2/AVX2 packed arithmetic shift calls. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208342 91177308-0d34-0410-b5e6-96231b3b80d8
52 lines
1.9 KiB
LLVM
52 lines
1.9 KiB
LLVM
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
|
|
|
|
; Verify that the backend correctly combines AVX2 builtin intrinsics.
|
|
|
|
|
|
; Chains three packed arithmetic right shifts on an <8 x i32> value:
; an immediate shift by 3, a vector-count shift, and an immediate shift by 2.
; NOTE(review): the vector-count form presumably takes its count from the low
; 64 bits of the second operand (3 here), so the chain totals 3 + 3 + 2 = 8,
; which is the single shift the expectations after this function look for.
; Confirm against the instruction reference.
define <8 x i32> @test_psra_1(<8 x i32> %A) {
  %sra.imm3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
  %sra.vec = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %sra.imm3, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
  %sra.imm2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %sra.vec, i32 2)
  ret <8 x i32> %sra.imm2
}
|
|
; CHECK-LABEL: test_psra_1
|
|
; CHECK: vpsrad $8, %ymm0, %ymm0
|
|
; CHECK-NEXT: ret
|
|
|
|
; 16-bit element variant of the chained-shift test: an immediate shift by 3,
; a vector-count shift, and an immediate shift by 2 on a <16 x i16> value.
; NOTE(review): the vector-count form presumably takes its count from the low
; 64 bits of the second operand (elements 0-3, i.e. 3 here); the 7 in
; element 4 would then be ignored and the chain totals 8. Confirm against the
; instruction reference.
define <16 x i16> @test_psra_2(<16 x i16> %A) {
  %sra.imm3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
  %sra.vec = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %sra.imm3, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
  %sra.imm2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %sra.vec, i32 2)
  ret <16 x i16> %sra.imm2
}
|
|
; CHECK-LABEL: test_psra_2
|
|
; CHECK: vpsraw $8, %ymm0, %ymm0
|
|
; CHECK-NEXT: ret
|
|
|
|
; Shift-by-zero case for 16-bit elements: both immediate counts are 0 and the
; first four i16 elements of the vector count are 0 (the 7 sits in element 4).
; NOTE(review): presumably only the low 64 bits of the vector operand act as
; the count, making every shift a no-op; the expectations after this function
; require that no word arithmetic shift is emitted. Confirm against the
; instruction reference.
define <16 x i16> @test_psra_3(<16 x i16> %A) {
  %sra.first = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
  %sra.vec = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %sra.first, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
  %sra.last = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %sra.vec, i32 0)
  ret <16 x i16> %sra.last
}
|
|
; CHECK-LABEL: test_psra_3
|
|
; CHECK-NOT: vpsraw
|
|
; CHECK: ret
|
|
|
|
; Shift-by-zero case for 32-bit elements: both immediate counts are 0 and the
; first two i32 elements of the vector count are 0 (the 7 sits in element 2).
; NOTE(review): presumably only the low 64 bits of the vector operand act as
; the count, making every shift a no-op; the expectations after this function
; require that no doubleword arithmetic shift is emitted. Confirm against the
; instruction reference.
define <8 x i32> @test_psra_4(<8 x i32> %A) {
  %sra.first = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
  %sra.vec = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %sra.first, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
  %sra.last = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %sra.vec, i32 0)
  ret <8 x i32> %sra.last
}
|
|
; CHECK-LABEL: test_psra_4
|
|
; CHECK-NOT: vpsrad
|
|
; CHECK: ret
|
|
|
|
|
|
; AVX2 packed arithmetic right-shift intrinsics used by the tests above.

; Immediate-count forms: the shift count is passed as an i32 operand.
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)

; Vector-count forms: the shift count is passed as a 128-bit vector operand.
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
|
|
|