mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-14 00:32:55 +00:00
8887371782
Instead of expanding a packed shift into a sequence of scalar shifts, the backend now tries (when possible) to convert the vector shift into a vector multiply. Before this change, a shift of a MVT::v8i16 vector by a build_vector of constants was always scalarized into a long sequence of "vector extracts + scalar shifts + vector insert". With this change, if there is SSE2 support, we emit a single vector multiply. This change also affects SSE4.1, AVX, AVX2 shifts: - A shift of a MVT::v4i32 vector by a build_vector of non-uniform constants is now lowered when possible into a single SSE4.1 vector multiply. - Packed v16i16 shifts left by a constant build_vector are now expanded when possible into a single AVX2 vpmullw. This change also improves the lowering of AVX512f vector shifts. Added test CodeGen/X86/vec_shift6.ll with some code examples that are affected by this change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201271 91177308-0d34-0410-b5e6-96231b3b80d8
149 lines
4.3 KiB
LLVM
149 lines
4.3 KiB
LLVM
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

;;; Shift left
; vshift00: shift every lane of an <8 x i32> vector left by the constant 2.
; Two vpslld instructions are expected (presumably one per 128-bit half,
; since the test is built with +avx only -- confirm against the backend).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift00:
; CHECK: vpslld
; CHECK: vpslld
define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
  %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %s
}

; vshift01: shift every lane of a <16 x i16> vector left by the constant 2.
; Two vpsllw instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift01:
; CHECK: vpsllw
; CHECK: vpsllw
define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
  %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <16 x i16> %s
}

; vshift02: shift every lane of a <4 x i64> vector left by the constant 2.
; Two vpsllq instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift02:
; CHECK: vpsllq
; CHECK: vpsllq
define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
  %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
  ret <4 x i64> %s
}

;;; Logical Shift right
; vshift03: logical right shift of every lane of an <8 x i32> vector by 2.
; Two vpsrld instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift03:
; CHECK: vpsrld
; CHECK: vpsrld
define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
  %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %s
}

; vshift04: logical right shift of every lane of a <16 x i16> vector by 2.
; Two vpsrlw instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift04:
; CHECK: vpsrlw
; CHECK: vpsrlw
define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
  %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <16 x i16> %s
}

; vshift05: logical right shift of every lane of a <4 x i64> vector by 2.
; Two vpsrlq instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift05:
; CHECK: vpsrlq
; CHECK: vpsrlq
define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
  %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
  ret <4 x i64> %s
}

;;; Arithmetic Shift right
; vshift06: arithmetic right shift of every lane of an <8 x i32> vector by 2.
; Two vpsrad instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift06:
; CHECK: vpsrad
; CHECK: vpsrad
define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
  %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %s
}

; vshift07: arithmetic right shift of every lane of a <16 x i16> vector by 2.
; Two vpsraw instructions are expected (presumably one per 128-bit half).
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift07:
; CHECK: vpsraw
; CHECK: vpsraw
define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
  %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <16 x i16> %s
}

; vshift09: arithmetic right shift of every lane of a <32 x i8> vector by 2.
; The expected sequence (word shift, and, xor, byte subtract) appears twice.
; NOTE(review): presumably the byte shift is emulated with a 16-bit shift plus
; a mask and a sign fix-up, once per 128-bit half -- confirm against the backend.
; The unprefixed mnemonics (pand, pxor, psubb) also match their v-prefixed
; AVX forms because FileCheck patterns match substrings of a line.
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift09:
; CHECK: vpsrlw
; CHECK: pand
; CHECK: pxor
; CHECK: psubb
; CHECK: vpsrlw
; CHECK: pand
; CHECK: pxor
; CHECK: psubb
define <32 x i8> @vshift09(<32 x i8> %a) nounwind readnone {
  %s = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %s
}

; vshift10: arithmetic right shift of every lane of a <32 x i8> vector by 7,
; which leaves each byte filled with copies of its sign bit.
; One pxor followed by two pcmpgtb instructions is expected.
; NOTE(review): presumably a zeroed register is compared against each
; 128-bit half -- confirm against the backend.
; The unprefixed mnemonics also match their v-prefixed AVX forms because
; FileCheck patterns match substrings of a line.
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift10:
; CHECK: pxor
; CHECK: pcmpgtb
; CHECK: pcmpgtb
define <32 x i8> @vshift10(<32 x i8> %a) nounwind readnone {
  %s = ashr <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %s
}

; vshift11: logical right shift of every lane of a <32 x i8> vector by 2.
; A word shift followed by an and is expected twice.
; NOTE(review): presumably the and masks off bits shifted in from the
; neighbouring byte, once per 128-bit half -- confirm against the backend.
; "pand" also matches "vpand" because FileCheck patterns match substrings.
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift11:
; CHECK: vpsrlw
; CHECK: pand
; CHECK: vpsrlw
; CHECK: pand
define <32 x i8> @vshift11(<32 x i8> %a) nounwind readnone {
  %s = lshr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %s
}

; vshift12: shift every lane of a <32 x i8> vector left by the constant 2.
; A word shift followed by an and is expected twice.
; NOTE(review): presumably the and masks off bits shifted in from the
; neighbouring byte, once per 128-bit half -- confirm against the backend.
; "pand" also matches "vpand" because FileCheck patterns match substrings.
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift12:
; CHECK: vpsllw
; CHECK: pand
; CHECK: vpsllw
; CHECK: pand
define <32 x i8> @vshift12(<32 x i8> %a) nounwind readnone {
  %s = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %s
}

;;; Support variable shifts
; vshift08: shift the constant vector of ones left by a per-lane variable
; amount taken from %a (i.e. the shift count, not the value, is variable).
; Expected: a vpslld by immediate 23, an extract of the upper 128-bit half,
; a second vpslld by 23, then the return.
; NOTE(review): the shift by 23 presumably places the count in the exponent
; field of a float to build the power of two -- confirm against the backend.
; The label (with trailing colon) matches only the symbol definition line.
; CHECK-LABEL: _vshift08:
; CHECK: vpslld $23
; CHECK: vextractf128 $1
; CHECK: vpslld $23
; CHECK: ret
define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
  %bitop = shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %a
  ret <8 x i32> %bitop
}

; PR15141
; vshift13: shift a <4 x i32> vector left by the non-uniform constant
; amounts <0, 1, 2, 4>. The expectations require that no vpsll* and no
; vcvttps2dq appear before a vpmulld, i.e. the shift is emitted as a
; single vector multiply rather than as per-lane shifts.
; The label anchors the expectations to this function's own output.
; CHECK-LABEL: _vshift13:
; CHECK-NOT: vpsll
; CHECK-NOT: vcvttps2dq
; CHECK: vpmulld
define <4 x i32> @vshift13(<4 x i32> %in) {
  %T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %T
}

;;; Uses shifts for sign extension
; sext_v16i16: truncate each i16 lane to i8 and sign-extend it back to i16,
; so only the low byte of every lane survives, sign-extended in place.
; Expected: a left shift / arithmetic right shift pair (vpsllw, vpsraw)
; twice, followed by vinsertf128 to reassemble the 256-bit result.
; The label (with trailing colon) matches only the symbol definition line.
; CHECK-LABEL: _sext_v16i16:
; CHECK: vpsllw
; CHECK: vpsraw
; CHECK: vpsllw
; CHECK: vpsraw
; CHECK: vinsertf128
define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
  %b = trunc <16 x i16> %a to <16 x i8>
  %c = sext <16 x i8> %b to <16 x i16>
  ret <16 x i16> %c
}

; sext_v8i32: truncate each i32 lane to i16 and sign-extend it back to i32,
; so only the low half of every lane survives, sign-extended in place.
; Expected: a left shift / arithmetic right shift pair (vpslld, vpsrad)
; twice, followed by vinsertf128 to reassemble the 256-bit result.
; The label (with trailing colon) matches only the symbol definition line.
; CHECK-LABEL: _sext_v8i32:
; CHECK: vpslld
; CHECK: vpsrad
; CHECK: vpslld
; CHECK: vpsrad
; CHECK: vinsertf128
define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
  %b = trunc <8 x i32> %a to <8 x i16>
  %c = sext <8 x i16> %b to <8 x i32>
  ret <8 x i32> %c
}