From 51409214d7471328df2e92e0a8f9d05221fe0e6f Mon Sep 17 00:00:00 2001 From: Nate Begeman Date: Wed, 28 Jul 2010 00:21:48 +0000 Subject: [PATCH] Implement a vectorized algorithm for <16 x i8> << <16 x i8> This is about 4x faster and smaller than the existing scalarization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@109566 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 94 +++++++++++++++++++++++------- test/CodeGen/X86/vec_shift4.ll | 13 ++++- 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 88bc8d0a92b..c0f5a403b9f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -840,6 +840,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Can turn SHL into an integer multiply. setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -7506,29 +7507,80 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); - assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); - assert(VT == MVT::v4i32 && "Only know how to lower v4i32"); - - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - Op.getOperand(1), DAG.getConstant(23, MVT::i32)); - - std::vector<Constant*> CV; LLVMContext *Context = DAG.getContext(); - CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); - CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); - CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); - CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); - Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, - false, false, 16); - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); - Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); - return DAG.getNode(ISD::MUL, dl, VT, Op, R); + assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + + if (VT == MVT::v4i32) { + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + + ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); + + std::vector<Constant*> CV(4, CI); + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); + Op = 
DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8) { + // a = a << 5; + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + Op.getOperand(1), DAG.getConstant(5, MVT::i32)); + + ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); + ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); + + std::vector<Constant*> CVM1(16, CM1); + std::vector<Constant*> CVM2(16, CM2); + Constant *C = ConstantVector::get(CVM1); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + // r = pblendv(r, psllw(r & (char16)15, 4), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(4, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + C = ConstantVector::get(CVM2); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, false, false, 16); + + // r = pblendv(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(2, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + // return pblendv(r, r+r, a); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); + return R; + } + return SDValue(); } 
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll index d8f4e4ec689..9ef7fbdb0c5 100644 --- a/test/CodeGen/X86/vec_shift4.ll +++ b/test/CodeGen/X86/vec_shift4.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s -define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp { +define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp { entry: ; CHECK-NOT: shll ; CHECK: pslld @@ -12,3 +12,14 @@ entry: %tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %tmp2 } + +define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { +entry: +; CHECK-NOT: shlb +; CHECK: pblendvb +; CHECK: pblendvb +; CHECK: pblendvb + %shl = shl <16 x i8> %r, %a ; <<16 x i8>> [#uses=1] + %tmp2 = bitcast <16 x i8> %shl to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %tmp2 +}