llvm-6502/test/CodeGen/X86/avx-shift.ll
Andrea Di Biagio 8887371782 [X86] Teach the backend how to lower vector shift left into multiply rather than scalarizing it.
Instead of expanding a packed shift into a sequence of scalar shifts,
the backend now tries (when possible) to convert the vector shift into a
vector multiply.

Before this change, a shift of a MVT::v8i16 vector by a
build_vector of constants was always scalarized into a long sequence of "vector
extracts + scalar shifts + vector insert".
With this change, if there is SSE2 support, we emit a single vector multiply.

This change also affects SSE4.1, AVX, AVX2 shifts:
 - A shift of a MVT::v4i32 vector by a build_vector of non uniform constants
is now lowered when possible into a single SSE4.1 vector multiply.
 - Packed v16i16 shift left by constant build_vector are now expanded when
possible into a single AVX2 vpmullw.
This change also improves the lowering of AVX512f vector shifts.

Added test CodeGen/X86/vec_shift6.ll with some code examples that are affected
by this change.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201271 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-12 23:42:28 +00:00

149 lines
4.3 KiB
LLVM

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
;;; Shift left
; CHECK: vpslld
; CHECK: vpslld
define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
%s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsllw
; CHECK: vpsllw
define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
%s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: vpsllq
; CHECK: vpsllq
define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
%s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
ret <4 x i64> %s
}
;;; Logical Shift right
; CHECK: vpsrld
; CHECK: vpsrld
define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
%s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsrlw
; CHECK: vpsrlw
define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
%s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: vpsrlq
; CHECK: vpsrlq
define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
%s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
ret <4 x i64> %s
}
;;; Arithmetic Shift right
; CHECK: vpsrad
; CHECK: vpsrad
define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
%s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsraw
; CHECK: vpsraw
define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
%s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: vpsrlw
; CHECK: pand
; CHECK: pxor
; CHECK: psubb
; CHECK: vpsrlw
; CHECK: pand
; CHECK: pxor
; CHECK: psubb
define <32 x i8> @vshift09(<32 x i8> %a) nounwind readnone {
%s = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %s
}
; CHECK: pxor
; CHECK: pcmpgtb
; CHECK: pcmpgtb
define <32 x i8> @vshift10(<32 x i8> %a) nounwind readnone {
%s = ashr <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %s
}
; CHECK: vpsrlw
; CHECK: pand
; CHECK: vpsrlw
; CHECK: pand
define <32 x i8> @vshift11(<32 x i8> %a) nounwind readnone {
%s = lshr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %s
}
; CHECK: vpsllw
; CHECK: pand
; CHECK: vpsllw
; CHECK: pand
define <32 x i8> @vshift12(<32 x i8> %a) nounwind readnone {
%s = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %s
}
;;; Support variable shifts
; CHECK: _vshift08
; CHECK: vpslld $23
; CHECK: vextractf128 $1
; CHECK: vpslld $23
; CHECK: ret
define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
%bitop = shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %a
ret <8 x i32> %bitop
}
; PR15141
; CHECK: _vshift13:
; CHECK-NOT: vpsll
; CHECK-NOT: vcvttps2dq
; CHECK: vpmulld
define <4 x i32> @vshift13(<4 x i32> %in) {
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
}
;;; Uses shifts for sign extension
; CHECK: _sext_v16i16
; CHECK: vpsllw
; CHECK: vpsraw
; CHECK: vpsllw
; CHECK: vpsraw
; CHECK: vinsertf128
define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
%b = trunc <16 x i16> %a to <16 x i8>
%c = sext <16 x i8> %b to <16 x i16>
ret <16 x i16> %c
}
; CHECK: _sext_v8i32
; CHECK: vpslld
; CHECK: vpsrad
; CHECK: vpslld
; CHECK: vpsrad
; CHECK: vinsertf128
define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
%b = trunc <8 x i32> %a to <8 x i16>
%c = sext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}