From 0d86d462f8458b803d5209a77961dc63d5a9dae0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sun, 20 Nov 2011 00:12:05 +0000
Subject: [PATCH] Add code for lowering v32i8 shifts by a splat to AVX2
 immediate shift instructions. Remove 256-bit splat handling from LowerShift
 as it was already handled by PerformShiftCombine.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@145005 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 81 +++++++++++++++---------------
 lib/Target/X86/X86InstrSSE.td      |  9 ++--
 test/CodeGen/X86/avx2-shift.ll     | 62 ++++++++++++++++++-----
 3 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 61d9246b259..470a1157478 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10338,47 +10338,48 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
       return Res;
     }
 
-    if (Subtarget->hasAVX2()) {
-      if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
-
-      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA)
-        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
-                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+    if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
+      if (Op.getOpcode() == ISD::SHL) {
+        // Make a large shift.
+        SDValue SHL =
+          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                      DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+                      R, DAG.getConstant(ShiftAmt, MVT::i32));
+        // Zero out the rightmost bits.
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                       MVT::i8));
+        return DAG.getNode(ISD::AND, dl, VT, SHL,
+                           DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
       }
+      if (Op.getOpcode() == ISD::SRL) {
+        // Make a large shift.
+        SDValue SRL =
+          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                      DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+                      R, DAG.getConstant(ShiftAmt, MVT::i32));
+        // Zero out the leftmost bits.
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                       MVT::i8));
+        return DAG.getNode(ISD::AND, dl, VT, SRL,
+                           DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+      }
+      if (Op.getOpcode() == ISD::SRA) {
+        if (ShiftAmt == 7) {
+          // R s>> 7  ===  R s< 0
+          SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+          return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+        }
+
+        // R s>> a === ((R u>> a) ^ m) - m
+        SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+                                                       MVT::i8));
+        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+        Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+        Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+        return Res;
+      }
+    }
   }
 }
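The v32i8 lowering above leans on three per-byte identities: vpsllw and vpsrlw shift whole 16-bit lanes, so bits that leak across the byte boundary inside each word are masked off afterwards, and the arithmetic shift is rebuilt from the logical one by sign-extending bit (7 - a). The following standalone check is illustrative only, not part of the patch (the file and variable names are mine, and it assumes the usual two's-complement behavior of >> on signed operands); it verifies each identity for every byte value and every shift amount:

    // byte_shift_identities.cpp -- exhaustive check of the identities used
    // in the v32i8 lowering, over all byte values and shift amounts 0..7.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int a = 0; a < 8; ++a) {
        for (int x = 0; x < 256; ++x) {
          // SHL: model vpsllw leaking bits upward from the neighboring
          // byte (0xFF below our byte), then zero the a rightmost bits.
          uint16_t hi = uint16_t((x << 8) | 0xFF);
          uint8_t shl = uint8_t(uint16_t(hi << a) >> 8) & uint8_t(0xFFu << a);
          assert(shl == uint8_t(x << a));

          // SRL: model vpsrlw leaking bits downward from the neighboring
          // byte (0xFF above our byte), then zero the a leftmost bits.
          uint16_t lo = uint16_t(0xFF00 | x);
          uint8_t srl = uint8_t(lo >> a) & uint8_t(0xFFu >> a);
          assert(srl == uint8_t(x >> a));

          // SRA: R s>> a === ((R u>> a) ^ m) - m with m = 128 >> a,
          // which sign-extends bit (7 - a) of the logical-shift result.
          uint8_t m = uint8_t(128u >> a);
          uint8_t sra = uint8_t(uint8_t(srl ^ m) - m);
          assert(sra == uint8_t(int8_t(x) >> a));
        }
      }
      // The ShiftAmt == 7 special case: only the sign survives, and a
      // signed compare against zero (vpcmpgtb) produces the same mask.
      for (int x = 0; x < 256; ++x)
        assert(uint8_t(int8_t(x) >> 7) == (int8_t(x) < 0 ? 0xFF : 0x00));
      puts("all identities hold");
      return 0;
    }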
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1e74da1b95e..68010b7186b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -311,17 +311,16 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
 // JIT implementation, it does not expand the instructions below like
 // X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
 def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                        [(set VR128:$dst, (v4i32 immAllOnesV))]>;
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+  let Predicates = [HasAVX] in
 def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                          [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX2] in
+  let Predicates = [HasAVX2] in
 def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
                           [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
+}
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
index f7593616a81..b9d1edcb139 100644
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -58,14 +58,14 @@ define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
 }
 ; CHECK: variable_sra0
-; CHECK: psravd
+; CHECK: vpsravd
 ; CHECK: ret
 define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
   %k = ashr <4 x i32> %x, %y
   ret <4 x i32> %k
 }
 ; CHECK: variable_sra1
-; CHECK: psravd
+; CHECK: vpsravd
 ; CHECK: ret
 define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
   %k = ashr <8 x i32> %x, %y
@@ -127,7 +127,7 @@ define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
 }
 ; CHECK: variable_sra0_load
-; CHECK: psravd (%
+; CHECK: vpsravd (%
 ; CHECK: ret
 define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
   %y1 = load <4 x i32>* %y
@@ -136,7 +136,7 @@ define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
   %k = ashr <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
 ; CHECK: variable_sra1_load
-; CHECK: psravd (%
+; CHECK: vpsravd (%
 ; CHECK: ret
 define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
   %y1 = load <8 x i32>* %y
@@ -145,7 +145,7 @@ define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
 }
 ; CHECK: variable_shl0_load
-; CHECK: psllvd (%
+; CHECK: vpsllvd (%
 ; CHECK: ret
 define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
   %y1 = load <4 x i32>* %y
@@ -153,7 +153,7 @@ define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
   %k = shl <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
 ; CHECK: variable_shl1_load
-; CHECK: psllvd (%
+; CHECK: vpsllvd (%
 ; CHECK: ret
 define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
   %y1 = load <8 x i32>* %y
@@ -161,7 +161,7 @@ define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
   %k = shl <8 x i32> %x, %y1
   ret <8 x i32> %k
 }
 ; CHECK: variable_shl2_load
-; CHECK: psllvq (%
+; CHECK: vpsllvq (%
 ; CHECK: ret
 define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
   %y1 = load <2 x i64>* %y
@@ -169,7 +169,7 @@ define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
   %k = shl <2 x i64> %x, %y1
   ret <2 x i64> %k
 }
 ; CHECK: variable_shl3_load
-; CHECK: psllvq (%
+; CHECK: vpsllvq (%
 ; CHECK: ret
 define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
   %y1 = load <4 x i64>* %y
@@ -177,7 +177,7 @@ define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
   %k = shl <4 x i64> %x, %y1
   ret <4 x i64> %k
 }
 ; CHECK: variable_srl0_load
-; CHECK: psrlvd (%
+; CHECK: vpsrlvd (%
 ; CHECK: ret
 define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
   %y1 = load <4 x i32>* %y
@@ -185,7 +185,7 @@ define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
   %k = lshr <4 x i32> %x, %y1
   ret <4 x i32> %k
 }
 ; CHECK: variable_srl1_load
-; CHECK: psrlvd (%
+; CHECK: vpsrlvd (%
 ; CHECK: ret
 define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
   %y1 = load <8 x i32>* %y
@@ -193,7 +193,7 @@ define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
   %k = lshr <8 x i32> %x, %y1
   ret <8 x i32> %k
 }
 ; CHECK: variable_srl2_load
-; CHECK: psrlvq (%
+; CHECK: vpsrlvq (%
 ; CHECK: ret
 define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
   %y1 = load <2 x i64>* %y
@@ -201,10 +201,48 @@ define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
   ret <2 x i64> %k
 }
 ; CHECK: variable_srl3_load
-; CHECK: psrlvq (%
+; CHECK: vpsrlvq (%
 ; CHECK: ret
 define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
   %y1 = load <4 x i64>* %y
   %k = lshr <4 x i64> %x, %y1
   ret <4 x i64> %k
 }
+
+define <32 x i8> @shl9(<32 x i8> %A) nounwind {
+  %B = shl <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <32 x i8> %B
+; CHECK: shl9:
+; CHECK: vpsllw $3
+; CHECK: vpand
+; CHECK: ret
+}
+
+define <32 x i8> @shr9(<32 x i8> %A) nounwind {
+  %B = lshr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <32 x i8> %B
+; CHECK: shr9:
+; CHECK: vpsrlw $3
+; CHECK: vpand
+; CHECK: ret
+}
+
+define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind {
+  %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <32 x i8> %B
+; CHECK: sra_v32i8_7:
+; CHECK: vxorps
+; CHECK: vpcmpgtb
+; CHECK: ret
+}
+
+define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
+  %B = ashr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <32 x i8> %B
+; CHECK: sra_v32i8:
+; CHECK: vpsrlw $3
+; CHECK: vpand
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: ret
+}
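As a usage reference, the sequences the new CHECK lines expect can also be written by hand with AVX2 intrinsics. This is a sketch, illustrative only (the function names are mine, not from the patch), and it assumes a compiler with AVX2 enabled, e.g. -mavx2:

    // avx2_byte_shifts.cpp -- hand-written equivalents of the sequences
    // the new tests check for, with the shift amount fixed at 3.
    #include <immintrin.h>

    // shl <32 x i8> by 3: vpsllw, then vpand to zero the 3 rightmost bits.
    __m256i shl3_v32i8(__m256i A) {
      __m256i wide = _mm256_slli_epi16(A, 3);                      // vpsllw $3
      return _mm256_and_si256(wide, _mm256_set1_epi8((char)0xF8)); // vpand
    }

    // lshr <32 x i8> by 3: vpsrlw, then vpand to zero the 3 leftmost bits.
    __m256i shr3_v32i8(__m256i A) {
      __m256i wide = _mm256_srli_epi16(A, 3);                      // vpsrlw $3
      return _mm256_and_si256(wide, _mm256_set1_epi8(0x1F));       // vpand
    }

    // ashr <32 x i8> by 3: logical shift, then ((x ^ m) - m), m = 128 >> 3.
    __m256i sra3_v32i8(__m256i A) {
      __m256i x = shr3_v32i8(A);                         // vpsrlw $3 + vpand
      __m256i m = _mm256_set1_epi8(0x10);
      x = _mm256_xor_si256(x, m);                        // vpxor
      return _mm256_sub_epi8(x, m);                      // vpsubb
    }

    // ashr <32 x i8> by 7: a single signed compare against zero.
    __m256i sra7_v32i8(__m256i A) {
      return _mm256_cmpgt_epi8(_mm256_setzero_si256(), A); // vpcmpgtb
    }

Compiled with optimization, each helper should reduce to essentially the instruction pair the corresponding CHECK lines test for.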