Add code for lowering v32i8 shifts by a splat to AVX2 immediate shift instructions. Remove 256-bit splat handling from LowerShift as it was already handled by PerformShiftCombine.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@145005 91177308-0d34-0410-b5e6-96231b3b80d8
Craig Topper 2011-11-20 00:12:05 +00:00
parent 745a86bac9
commit 0d86d462f8
3 changed files with 95 additions and 57 deletions
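Background on the added lowering (an illustrative sketch, not part of the patch): AVX2 has no byte-granularity immediate shifts, so a v32i8 shift by a splat constant is emitted as the 16-bit immediate shift (vpsllw/vpsrlw over the whole 256-bit register) followed by a vpand that clears the bits crossing over from the neighbouring byte inside each 16-bit lane. A minimal standalone C++ model of one such lane (names and layout are illustrative, not LLVM code):

#include <cassert>
#include <cstdint>

// One 16-bit lane as vpsllw sees it: two packed bytes shifted as a single word,
// then masked with the byte splat uint8_t(-1U << amt) to drop the bits that
// spilled from the low byte into the high byte.
static void shl_bytes_via_word(uint8_t lo, uint8_t hi, unsigned amt,
                               uint8_t &outLo, uint8_t &outHi) {
  uint16_t lane = static_cast<uint16_t>(hi) << 8 | lo;
  uint16_t shifted = static_cast<uint16_t>(lane << amt); // vpsllw $amt
  uint8_t mask = static_cast<uint8_t>(-1U << amt);       // BUILD_VECTOR splat
  outLo = static_cast<uint8_t>(shifted) & mask;          // vpand
  outHi = static_cast<uint8_t>(shifted >> 8) & mask;
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt)
    for (unsigned lo = 0; lo < 256; ++lo)
      for (unsigned hi = 0; hi < 256; ++hi) {
        uint8_t sLo, sHi;
        shl_bytes_via_word(static_cast<uint8_t>(lo), static_cast<uint8_t>(hi),
                           amt, sLo, sHi);
        // Each byte must match an independent 8-bit shift.
        assert(sLo == static_cast<uint8_t>(lo << amt));
        assert(sHi == static_cast<uint8_t>(hi << amt));
      }
  return 0;
}

The logical right shift uses the same pattern with vpsrlw and the mask uint8_t(-1U) >> ShiftAmt, clearing the bits that drift down from the high byte, as in the BUILD_VECTOR constants in the hunk below.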


@@ -10338,47 +10338,48 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
if (Subtarget->hasAVX2()) {
if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
// Zero out the rightmost bits.
SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
// Zero out the leftmost bits.
SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
// R s>> 7 === R s< 0
SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
}
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
}
}
}
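The arithmetic-shift path relies on the identity in the comment above: with no byte-element vpsra, R s>> a is computed as ((R u>> a) ^ m) - m where m = 128 >> a, i.e. the logical shift followed by an XOR/subtract that re-extends the sign bit (vpxor + vpsubb after the vpsrlw/vpand expansion of the ISD::SRL). A small standalone C++ check of the identity over all byte values (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// Check R s>> a == ((R u>> a) ^ m) - m with m = 128 >> a, for every byte value.
// (Assumes the usual two's-complement, arithmetic-shift behaviour for int8_t.)
int main() {
  for (unsigned a = 0; a < 8; ++a) {
    uint8_t m = static_cast<uint8_t>(128 >> a);        // the shifted-down sign bit
    for (unsigned r = 0; r < 256; ++r) {
      uint8_t logical = static_cast<uint8_t>(r) >> a;  // R u>> a (vpsrlw + vpand)
      uint8_t viaMask = static_cast<uint8_t>((logical ^ m) - m); // vpxor, vpsubb
      uint8_t expected =
          static_cast<uint8_t>(static_cast<int8_t>(static_cast<uint8_t>(r)) >> a);
      assert(viaMask == expected);                     // true arithmetic shift
    }
  }
  return 0;
}

For ShiftAmt == 7 every result byte is either all-ones or zero depending only on the sign, which is why that case is emitted as a vpcmpgtb against a zero vector instead.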


@@ -311,17 +311,16 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
// JIT implementation, it does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
let Predicates = [HasAVX] in
def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX2] in
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
}
//===----------------------------------------------------------------------===//


@@ -58,14 +58,14 @@ define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
}
; CHECK: variable_sra0
; CHECK: psravd
; CHECK: vpsravd
; CHECK: ret
define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
%k = ashr <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_sra1
; CHECK: psravd
; CHECK: vpsravd
; CHECK: ret
define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
%k = ashr <8 x i32> %x, %y
@@ -127,7 +127,7 @@ define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
}
; CHECK: variable_sra0_load
; CHECK: psravd (%
; CHECK: vpsravd (%
; CHECK: ret
define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
@@ -136,7 +136,7 @@ define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
}
; CHECK: variable_sra1_load
; CHECK: psravd (%
; CHECK: vpsravd (%
; CHECK: ret
define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
@@ -145,7 +145,7 @@ define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
}
; CHECK: variable_shl0_load
; CHECK: psllvd (%
; CHECK: vpsllvd (%
; CHECK: ret
define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
@@ -153,7 +153,7 @@ define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
ret <4 x i32> %k
}
; CHECK: variable_shl1_load
; CHECK: psllvd (%
; CHECK: vpsllvd (%
; CHECK: ret
define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
@@ -161,7 +161,7 @@ define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
ret <8 x i32> %k
}
; CHECK: variable_shl2_load
; CHECK: psllvq (%
; CHECK: vpsllvq (%
; CHECK: ret
define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
%y1 = load <2 x i64>* %y
@@ -169,7 +169,7 @@ define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
ret <2 x i64> %k
}
; CHECK: variable_shl3_load
; CHECK: psllvq (%
; CHECK: vpsllvq (%
; CHECK: ret
define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
%y1 = load <4 x i64>* %y
@@ -177,7 +177,7 @@ define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
ret <4 x i64> %k
}
; CHECK: variable_srl0_load
; CHECK: psrlvd (%
; CHECK: vpsrlvd (%
; CHECK: ret
define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
@@ -185,7 +185,7 @@ define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
ret <4 x i32> %k
}
; CHECK: variable_srl1_load
; CHECK: psrlvd (%
; CHECK: vpsrlvd (%
; CHECK: ret
define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
@@ -193,7 +193,7 @@ define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
ret <8 x i32> %k
}
; CHECK: variable_srl2_load
; CHECK: psrlvq (%
; CHECK: vpsrlvq (%
; CHECK: ret
define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
%y1 = load <2 x i64>* %y
@@ -201,10 +201,48 @@ define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
ret <2 x i64> %k
}
; CHECK: variable_srl3_load
; CHECK: psrlvq (%
; CHECK: vpsrlvq (%
; CHECK: ret
define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
%y1 = load <4 x i64>* %y
%k = lshr <4 x i64> %x, %y1
ret <4 x i64> %k
}
define <32 x i8> @shl9(<32 x i8> %A) nounwind {
%B = shl <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %B
; CHECK: shl9:
; CHECK: vpsllw $3
; CHECK: vpand
; CHECK: ret
}
define <32 x i8> @shr9(<32 x i8> %A) nounwind {
%B = lshr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %B
; CHECK: shr9:
; CHECK: vpsrlw $3
; CHECK: vpand
; CHECK: ret
}
define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind {
%B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %B
; CHECK: sra_v32i8_7:
; CHECK: vxorps
; CHECK: vpcmpgtb
; CHECK: ret
}
define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
%B = ashr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %B
; CHECK: sra_v32i8:
; CHECK: vpsrlw $3
; CHECK: vpand
; CHECK: vpxor
; CHECK: vpsubb
; CHECK: ret
}