X86: fold SSE2/AVX2 logical shift by immediate amount into zero vector when possible

Patch by Andrea Di Biagio
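
For example, a logical shift left of a <4 x i32> vector by a splat of 32 can
never produce anything but zeros, so the shift is folded into a zero vector.
An illustrative sketch in the style of the new tests below (the function name
is hypothetical; the zeroing idiom actually emitted, e.g. xorps, depends on
the target):

define <4 x i32> @fold_to_zero(<4 x i32> %InVec) {
entry:
  ; Every 32-bit lane shifted left by 32 is 0, so no shift is emitted;
  ; the result is materialized with a zeroing idiom such as xorps.
  %shl = shl <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
  ret <4 x i32> %shl
}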


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186165 91177308-0d34-0410-b5e6-96231b3b80d8
Stephen Lin 2013-07-12 15:31:36 +00:00
parent 55ec2218c4
commit fff967358b
3 changed files with 532 additions and 0 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -16321,6 +16321,38 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  return SDValue();
}

/// \brief Returns a vector of 0s if the input node is a vector logical
/// shift by a constant amount that is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget->hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
  if (isSplatVector(Amt.getNode())) {
    SDValue SclrAmt = Amt->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
      APInt ShiftAmt = C->getAPIntValue();
      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();

      // SSE2/AVX2 logical shifts always return a vector of 0s
      // if the shift amount is bigger than or equal to
      // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
      if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT, Subtarget, DAG, DL);
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -16330,6 +16362,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
    if (V.getNode()) return V;
  }

  if (N->getOpcode() != ISD::SRA) {
    // Try to fold this logical shift into a zero vector.
    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
    if (V.getNode()) return V;
  }

  return SDValue();
}
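
Note the ISD::SRA guard above: an arithmetic shift right by the element size
or more does not produce zeros, because the hardware clamps the amount and
fills every lane with the sign bit, so only logical shifts can be folded
away. A minimal sketch of a case that must not be folded (hypothetical
function name, mirroring the tests below):

define <8 x i16> @sra_not_folded(<8 x i16> %InVec) {
entry:
  ; Each lane becomes 0 or -1 depending on its sign bit, so the
  ; shift (e.g. psraw $16) is kept rather than folded to zero.
  %sra = ashr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
  ret <8 x i16> %sra
}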

test/CodeGen/X86/avx2-vector-shifts.ll

@@ -0,0 +1,247 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
; AVX2 Logical Shift Left
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
entry:
%shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
; CHECK: test_sllw_1:
; CHECK: vpsllw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
entry:
%shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
; CHECK: test_sllw_2:
; CHECK: vpaddw %ymm0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
entry:
%shl = shl <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <16 x i16> %shl
}
; CHECK: test_sllw_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
entry:
%shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
; CHECK: test_slld_1:
; CHECK: vpslld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
entry:
%shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
; CHECK: test_slld_2:
; CHECK: vpaddd %ymm0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
entry:
%shl = shl <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <8 x i32> %shl
}
; CHECK: test_slld_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
entry:
%shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
; CHECK: test_sllq_1:
; CHECK: vpsllq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
entry:
%shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
; CHECK: test_sllq_2:
; CHECK: vpaddq %ymm0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
entry:
%shl = shl <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
ret <4 x i64> %shl
}
; CHECK: test_sllq_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret
; AVX2 Arithmetic Shift
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
entry:
%shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
; CHECK: test_sraw_1:
; CHECK: vpsraw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
entry:
%shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
; CHECK: test_sraw_2:
; CHECK: vpsraw $1, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
entry:
%shl = ashr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <16 x i16> %shl
}
; CHECK: test_sraw_3:
; CHECK: vpsraw $16, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
entry:
%shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
; CHECK: test_srad_1:
; CHECK: vpsrad $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
entry:
%shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
; CHECK: test_srad_2:
; CHECK: vpsrad $1, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
entry:
%shl = ashr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <8 x i32> %shl
}
; CHECK: test_srad_3:
; CHECK: vpsrad $32, %ymm0, %ymm0
; CHECK: ret
; AVX2 Logical Shift Right
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
entry:
%shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
; CHECK: test_srlw_1:
; CHECK: vpsrlw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
entry:
%shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
; CHECK: test_srlw_2:
; CHECK: vpsrlw $1, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
entry:
%shl = lshr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <16 x i16> %shl
}
; CHECK: test_srlw_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
entry:
%shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
; CHECK: test_srld_1:
; CHECK: vpsrld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
entry:
%shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
; CHECK: test_srld_2:
; CHECK: vpsrld $1, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
entry:
%shl = lshr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <8 x i32> %shl
}
; CHECK: test_srld_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
entry:
%shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
; CHECK: test_srlq_1:
; CHECK: vpsrlq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
entry:
%shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
; CHECK: test_srlq_2:
; CHECK: vpsrlq $1, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
entry:
%shl = lshr <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
ret <4 x i64> %shl
}
; CHECK: test_srlq_3:
; CHECK: vxorps %ymm0, %ymm0, %ymm0
; CHECK: ret

test/CodeGen/X86/sse2-vector-shifts.ll

@@ -0,0 +1,247 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 -mcpu=corei7 | FileCheck %s
; SSE2 Logical Shift Left
define <8 x i16> @test_sllw_1(<8 x i16> %InVec) {
entry:
%shl = shl <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <8 x i16> %shl
}
; CHECK: test_sllw_1:
; CHECK: psllw $0, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_sllw_2(<8 x i16> %InVec) {
entry:
%shl = shl <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %shl
}
; CHECK: test_sllw_2:
; CHECK: paddw %xmm0, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_sllw_3(<8 x i16> %InVec) {
entry:
%shl = shl <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <8 x i16> %shl
}
; CHECK: test_sllw_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_slld_1(<4 x i32> %InVec) {
entry:
%shl = shl <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %shl
}
; CHECK: test_slld_1:
; CHECK: pslld $0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_slld_2(<4 x i32> %InVec) {
entry:
%shl = shl <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shl
}
; CHECK: test_slld_2:
; CHECK: paddd %xmm0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_slld_3(<4 x i32> %InVec) {
entry:
%shl = shl <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
ret <4 x i32> %shl
}
; CHECK: test_slld_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_sllq_1(<2 x i64> %InVec) {
entry:
%shl = shl <2 x i64> %InVec, <i64 0, i64 0>
ret <2 x i64> %shl
}
; CHECK: test_sllq_1:
; CHECK: psllq $0, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_sllq_2(<2 x i64> %InVec) {
entry:
%shl = shl <2 x i64> %InVec, <i64 1, i64 1>
ret <2 x i64> %shl
}
; CHECK: test_sllq_2:
; CHECK: paddq %xmm0, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_sllq_3(<2 x i64> %InVec) {
entry:
%shl = shl <2 x i64> %InVec, <i64 64, i64 64>
ret <2 x i64> %shl
}
; CHECK: test_sllq_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret
; SSE2 Arithmetic Shift
define <8 x i16> @test_sraw_1(<8 x i16> %InVec) {
entry:
%shl = ashr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <8 x i16> %shl
}
; CHECK: test_sraw_1:
; CHECK: psraw $0, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_sraw_2(<8 x i16> %InVec) {
entry:
%shl = ashr <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %shl
}
; CHECK: test_sraw_2:
; CHECK: psraw $1, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_sraw_3(<8 x i16> %InVec) {
entry:
%shl = ashr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <8 x i16> %shl
}
; CHECK: test_sraw_3:
; CHECK: psraw $16, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srad_1(<4 x i32> %InVec) {
entry:
%shl = ashr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %shl
}
; CHECK: test_srad_1:
; CHECK: psrad $0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srad_2(<4 x i32> %InVec) {
entry:
%shl = ashr <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shl
}
; CHECK: test_srad_2:
; CHECK: psrad $1, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srad_3(<4 x i32> %InVec) {
entry:
%shl = ashr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
ret <4 x i32> %shl
}
; CHECK: test_srad_3:
; CHECK: psrad $32, %xmm0
; CHECK-NEXT: ret
; SSE2 Logical Shift Right
define <8 x i16> @test_srlw_1(<8 x i16> %InVec) {
entry:
%shl = lshr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <8 x i16> %shl
}
; CHECK: test_srlw_1:
; CHECK: psrlw $0, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_srlw_2(<8 x i16> %InVec) {
entry:
%shl = lshr <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %shl
}
; CHECK: test_srlw_2:
; CHECK: psrlw $1, %xmm0
; CHECK-NEXT: ret
define <8 x i16> @test_srlw_3(<8 x i16> %InVec) {
entry:
%shl = lshr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
ret <8 x i16> %shl
}
; CHECK: test_srlw_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srld_1(<4 x i32> %InVec) {
entry:
%shl = lshr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %shl
}
; CHECK: test_srld_1:
; CHECK: psrld $0, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srld_2(<4 x i32> %InVec) {
entry:
%shl = lshr <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shl
}
; CHECK: test_srld_2:
; CHECK: psrld $1, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srld_3(<4 x i32> %InVec) {
entry:
%shl = lshr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
ret <4 x i32> %shl
}
; CHECK: test_srld_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_srlq_1(<2 x i64> %InVec) {
entry:
%shl = lshr <2 x i64> %InVec, <i64 0, i64 0>
ret <2 x i64> %shl
}
; CHECK: test_srlq_1:
; CHECK: psrlq $0, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_srlq_2(<2 x i64> %InVec) {
entry:
%shl = lshr <2 x i64> %InVec, <i64 1, i64 1>
ret <2 x i64> %shl
}
; CHECK: test_srlq_2:
; CHECK: psrlq $1, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_srlq_3(<2 x i64> %InVec) {
entry:
%shl = lshr <2 x i64> %InVec, <i64 64, i64 64>
ret <2 x i64> %shl
}
; CHECK: test_srlq_3:
; CHECK: xorps %xmm0, %xmm0
; CHECK-NEXT: ret