mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-28 06:32:09 +00:00
Add custom lowering of X86 vector SRA/SRL/SHL when the shift amount is a splat vector.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@131179 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
41cdc16e73
commit
4301222525
@ -927,7 +927,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
||||
// Can turn SHL into an integer multiply.
|
||||
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::SRL, MVT::v4i32, Legal);
|
||||
|
||||
// i8 and i16 vectors are custom, because the source register and
|
||||
// source memory operand types are not the same width. f32 vectors are
|
||||
@ -949,6 +948,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
||||
}
|
||||
}
|
||||
|
||||
if (Subtarget->hasSSE2()) {
|
||||
setOperationAction(ISD::SRL, MVT::v2i64, Custom);
|
||||
setOperationAction(ISD::SRL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
|
||||
|
||||
setOperationAction(ISD::SHL, MVT::v2i64, Custom);
|
||||
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SHL, MVT::v8i16, Custom);
|
||||
|
||||
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
|
||||
}
|
||||
|
||||
if (Subtarget->hasSSE42())
|
||||
setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
|
||||
|
||||
@ -6616,9 +6628,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
|
||||
}
|
||||
|
||||
|
||||
/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
|
||||
/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
|
||||
/// take a 2 x i32 value to shift plus a shift amount.
|
||||
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
|
||||
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
|
||||
EVT VT = Op.getValueType();
|
||||
unsigned VTBits = VT.getSizeInBits();
|
||||
@ -8778,16 +8790,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
|
||||
return Res;
|
||||
}
|
||||
|
||||
SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
EVT VT = Op.getValueType();
|
||||
DebugLoc dl = Op.getDebugLoc();
|
||||
SDValue R = Op.getOperand(0);
|
||||
SDValue Amt = Op.getOperand(1);
|
||||
|
||||
LLVMContext *Context = DAG.getContext();
|
||||
|
||||
assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
|
||||
// Must have SSE2.
|
||||
if (!Subtarget->hasSSE2()) return SDValue();
|
||||
|
||||
if (VT == MVT::v4i32) {
|
||||
// Optimize shl/srl/sra with constant shift amount.
|
||||
if (isSplatVector(Amt.getNode())) {
|
||||
SDValue SclrAmt = Amt->getOperand(0);
|
||||
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
|
||||
uint64_t ShiftAmt = C->getZExtValue();
|
||||
|
||||
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
|
||||
if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
|
||||
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
|
||||
R, DAG.getConstant(ShiftAmt, MVT::i32));
|
||||
}
|
||||
}
|
||||
|
||||
// Lower SHL with variable shift amount.
|
||||
// Cannot lower SHL without SSE4.1 or later.
|
||||
if (!Subtarget->hasSSE41()) return SDValue();
|
||||
|
||||
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
|
||||
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
|
||||
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
|
||||
@ -8806,7 +8873,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
|
||||
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
|
||||
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
|
||||
}
|
||||
if (VT == MVT::v16i8) {
|
||||
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
|
||||
// a = a << 5;
|
||||
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
|
||||
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
|
||||
@ -9111,7 +9178,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
|
||||
case ISD::SHL_PARTS:
|
||||
case ISD::SRA_PARTS:
|
||||
case ISD::SRL_PARTS: return LowerShift(Op, DAG);
|
||||
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
|
||||
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
|
||||
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
|
||||
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
|
||||
@ -9139,7 +9206,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
|
||||
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
|
||||
case ISD::SHL: return LowerSHL(Op, DAG);
|
||||
case ISD::SRA:
|
||||
case ISD::SRL:
|
||||
case ISD::SHL: return LowerShift(Op, DAG);
|
||||
case ISD::SADDO:
|
||||
case ISD::UADDO:
|
||||
case ISD::SSUBO:
|
||||
|
@ -770,7 +770,7 @@ namespace llvm {
|
||||
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
|
||||
SelectionDAG &DAG) const;
|
||||
SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
|
||||
@ -805,7 +805,7 @@ namespace llvm {
|
||||
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
142
test/CodeGen/X86/x86-shifts.ll
Normal file
142
test/CodeGen/X86/x86-shifts.ll
Normal file
@ -0,0 +1,142 @@
|
||||
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
|
||||
|
||||
; Splat patterns below
|
||||
|
||||
|
||||
define <4 x i32> @shl4(<4 x i32> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shl4
|
||||
; CHECK: pslld
|
||||
; CHECK-NEXT: pslld
|
||||
%B = shl <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
|
||||
%C = shl <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
|
||||
%K = xor <4 x i32> %B, %C
|
||||
ret <4 x i32> %K
|
||||
}
|
||||
|
||||
define <4 x i32> @shr4(<4 x i32> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr4
|
||||
; CHECK: psrld
|
||||
; CHECK-NEXT: psrld
|
||||
%B = lshr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
|
||||
%C = lshr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
|
||||
%K = xor <4 x i32> %B, %C
|
||||
ret <4 x i32> %K
|
||||
}
|
||||
|
||||
define <4 x i32> @sra4(<4 x i32> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: sra4
|
||||
; CHECK: psrad
|
||||
; CHECK-NEXT: psrad
|
||||
%B = ashr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
|
||||
%C = ashr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
|
||||
%K = xor <4 x i32> %B, %C
|
||||
ret <4 x i32> %K
|
||||
}
|
||||
|
||||
define <2 x i64> @shl2(<2 x i64> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shl2
|
||||
; CHECK: psllq
|
||||
; CHECK-NEXT: psllq
|
||||
%B = shl <2 x i64> %A, < i64 2, i64 2>
|
||||
%C = shl <2 x i64> %A, < i64 9, i64 9>
|
||||
%K = xor <2 x i64> %B, %C
|
||||
ret <2 x i64> %K
|
||||
}
|
||||
|
||||
define <2 x i64> @shr2(<2 x i64> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr2
|
||||
; CHECK: psrlq
|
||||
; CHECK-NEXT: psrlq
|
||||
%B = lshr <2 x i64> %A, < i64 8, i64 8>
|
||||
%C = lshr <2 x i64> %A, < i64 1, i64 1>
|
||||
%K = xor <2 x i64> %B, %C
|
||||
ret <2 x i64> %K
|
||||
}
|
||||
|
||||
|
||||
define <8 x i16> @shl8(<8 x i16> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shl8
|
||||
; CHECK: psllw
|
||||
; CHECK-NEXT: psllw
|
||||
%B = shl <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
|
||||
%C = shl <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
||||
%K = xor <8 x i16> %B, %C
|
||||
ret <8 x i16> %K
|
||||
}
|
||||
|
||||
define <8 x i16> @shr8(<8 x i16> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr8
|
||||
; CHECK: psrlw
|
||||
; CHECK-NEXT: psrlw
|
||||
%B = lshr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
|
||||
%C = lshr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
||||
%K = xor <8 x i16> %B, %C
|
||||
ret <8 x i16> %K
|
||||
}
|
||||
|
||||
define <8 x i16> @sra8(<8 x i16> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: sra8
|
||||
; CHECK: psraw
|
||||
; CHECK-NEXT: psraw
|
||||
%B = ashr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
|
||||
%C = ashr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
||||
%K = xor <8 x i16> %B, %C
|
||||
ret <8 x i16> %K
|
||||
}
|
||||
|
||||
; non splat test
|
||||
|
||||
|
||||
define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: sll8_nosplat
|
||||
; CHECK-NOT: psll
|
||||
; CHECK-NOT: psll
|
||||
%B = shl <8 x i16> %A, < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
|
||||
%C = shl <8 x i16> %A, < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
|
||||
%K = xor <8 x i16> %B, %C
|
||||
ret <8 x i16> %K
|
||||
}
|
||||
|
||||
|
||||
define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr2_nosplat
|
||||
; CHECK-NOT: psrlq
|
||||
; CHECK-NOT: psrlq
|
||||
%B = lshr <2 x i64> %A, < i64 8, i64 1>
|
||||
%C = lshr <2 x i64> %A, < i64 1, i64 0>
|
||||
%K = xor <2 x i64> %B, %C
|
||||
ret <2 x i64> %K
|
||||
}
|
||||
|
||||
|
||||
; Other shifts
|
||||
|
||||
define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shl2_other
|
||||
; CHECK-NOT: psllq
|
||||
%B = shl <2 x i32> %A, < i32 2, i32 2>
|
||||
%C = shl <2 x i32> %A, < i32 9, i32 9>
|
||||
%K = xor <2 x i32> %B, %C
|
||||
ret <2 x i32> %K
|
||||
}
|
||||
|
||||
define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
|
||||
entry:
|
||||
; CHECK: shr2_other
|
||||
; CHECK-NOT: psrlq
|
||||
%B = lshr <2 x i32> %A, < i32 8, i32 8>
|
||||
%C = lshr <2 x i32> %A, < i32 1, i32 1>
|
||||
%K = xor <2 x i32> %B, %C
|
||||
ret <2 x i32> %K
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user