mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-20 12:31:40 +00:00
Implement a vectorized algorithm for <16 x i8> << <16 x i8>
This is about 4x faster and smaller than the existing scalarization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@109566 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f374ba2bcd
commit
51409214d7
@ -840,6 +840,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
|||||||
|
|
||||||
// Can turn SHL into an integer multiply.
|
// Can turn SHL into an integer multiply.
|
||||||
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
|
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
|
||||||
|
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
|
||||||
|
|
||||||
// i8 and i16 vectors are custom , because the source register and source
|
// i8 and i16 vectors are custom , because the source register and source
|
||||||
// source memory operand types are not the same width. f32 vectors are
|
// source memory operand types are not the same width. f32 vectors are
|
||||||
@ -7506,19 +7507,18 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
DebugLoc dl = Op.getDebugLoc();
|
DebugLoc dl = Op.getDebugLoc();
|
||||||
SDValue R = Op.getOperand(0);
|
SDValue R = Op.getOperand(0);
|
||||||
|
|
||||||
assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
|
LLVMContext *Context = DAG.getContext();
|
||||||
assert(VT == MVT::v4i32 && "Only know how to lower v4i32");
|
|
||||||
|
|
||||||
|
assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
|
||||||
|
|
||||||
|
if (VT == MVT::v4i32) {
|
||||||
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
|
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
|
||||||
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
|
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
|
||||||
|
|
||||||
std::vector<Constant*> CV;
|
ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
|
||||||
LLVMContext *Context = DAG.getContext();
|
|
||||||
CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
|
std::vector<Constant*> CV(4, CI);
|
||||||
CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
|
|
||||||
CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
|
|
||||||
CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
|
|
||||||
Constant *C = ConstantVector::get(CV);
|
Constant *C = ConstantVector::get(CV);
|
||||||
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
|
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
|
||||||
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
|
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
|
||||||
@ -7530,6 +7530,58 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
|
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
|
||||||
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
|
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
|
||||||
}
|
}
|
||||||
|
if (VT == MVT::v16i8) {
|
||||||
|
// a = a << 5;
|
||||||
|
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
|
||||||
|
Op.getOperand(1), DAG.getConstant(5, MVT::i32));
|
||||||
|
|
||||||
|
ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
|
||||||
|
ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
|
||||||
|
|
||||||
|
std::vector<Constant*> CVM1(16, CM1);
|
||||||
|
std::vector<Constant*> CVM2(16, CM2);
|
||||||
|
Constant *C = ConstantVector::get(CVM1);
|
||||||
|
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
|
||||||
|
SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
|
||||||
|
PseudoSourceValue::getConstantPool(), 0,
|
||||||
|
false, false, 16);
|
||||||
|
|
||||||
|
// r = pblendv(r, psllw(r & (char16)15, 4), a);
|
||||||
|
M = DAG.getNode(ISD::AND, dl, VT, R, M);
|
||||||
|
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
|
||||||
|
DAG.getConstant(4, MVT::i32));
|
||||||
|
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
|
||||||
|
R, M, Op);
|
||||||
|
// a += a
|
||||||
|
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
|
||||||
|
|
||||||
|
C = ConstantVector::get(CVM2);
|
||||||
|
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
|
||||||
|
M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
|
||||||
|
PseudoSourceValue::getConstantPool(), 0, false, false, 16);
|
||||||
|
|
||||||
|
// r = pblendv(r, psllw(r & (char16)63, 2), a);
|
||||||
|
M = DAG.getNode(ISD::AND, dl, VT, R, M);
|
||||||
|
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
|
||||||
|
DAG.getConstant(2, MVT::i32));
|
||||||
|
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
|
||||||
|
R, M, Op);
|
||||||
|
// a += a
|
||||||
|
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
|
||||||
|
|
||||||
|
// return pblendv(r, r+r, a);
|
||||||
|
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||||
|
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
|
||||||
|
R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
return SDValue();
|
||||||
|
}
|
||||||
|
|
||||||
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
|
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
|
||||||
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
|
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
|
; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
|
||||||
|
|
||||||
define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
|
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
|
||||||
entry:
|
entry:
|
||||||
; CHECK-NOT: shll
|
; CHECK-NOT: shll
|
||||||
; CHECK: pslld
|
; CHECK: pslld
|
||||||
@ -12,3 +12,14 @@ entry:
|
|||||||
%tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1]
|
%tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1]
|
||||||
ret <2 x i64> %tmp2
|
ret <2 x i64> %tmp2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
|
||||||
|
entry:
|
||||||
|
; CHECK-NOT: shlb
|
||||||
|
; CHECK: pblendvb
|
||||||
|
; CHECK: pblendvb
|
||||||
|
; CHECK: pblendvb
|
||||||
|
%shl = shl <16 x i8> %r, %a ; <<16 x i8>> [#uses=1]
|
||||||
|
%tmp2 = bitcast <16 x i8> %shl to <2 x i64> ; <<2 x i64>> [#uses=1]
|
||||||
|
ret <2 x i64> %tmp2
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user