canonicalize a v2f64 blendi of 2 registers
This canonicalization step saves us 3 pattern matching possibilities * 4 math ops for scalar FP math that uses xmm regs. The backend can re-commute the operands post-instruction-selection if that makes register allocation better.

The tests in llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll cover this scenario already, so there are no new tests with this patch.

Differential Revision: http://reviews.llvm.org/D7777

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@230024 91177308-0d34-0410-b5e6-96231b3b80d8
parent 6039d05a22
commit 74e8bf678a
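For context, here is a rough illustration (not part of the patch; the helper name and use of intrinsics are purely illustrative) of the kind of scalar-FP-on-xmm code this affects. Math performed on the low lane of a v2f64 can be selected as a BLENDPD whose operands arrive in the "reversed" order (mask 2); the combine added below rewrites that to the canonical mask-1 form so one set of patterns covers all four math ops.

// Illustrative sketch only (not from the patch): scalar math on the low
// double of an xmm value, with the result reinserted into lane 0.
#include <immintrin.h>

__m128d add_low_lane(__m128d a, __m128d b) {
  double lo = _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b); // scalar op on lane 0
  return _mm_move_sd(a, _mm_set_sd(lo));           // result = {lo, a[1]}
  // Depending on how the insert is lowered, this can reach instruction
  // selection as a v2f64 BLENDI with mask 2 (operands reversed); the new
  // combine flips it to the mask-1 form that the remaining .td patterns match.
}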
lib/Target/X86/X86ISelLowering.cpp

@@ -23024,6 +23024,32 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                      LoadScalarToVector, N->getOperand(2));
 }
 
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue V0 = N->getOperand(0);
+  SDValue V1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+  // operands and changing the mask to 1. This saves us a bunch of
+  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+  // x86InstrInfo knows how to commute this back after instruction selection
+  // if it would help register allocation.
+
+  // TODO: If optimizing for size or a processor that doesn't suffer from
+  // partial register update stalls, this should be transformed into a MOVSD
+  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+  if (VT == MVT::v2f64)
+    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+        SDValue NewMask = DAG.getConstant(1, MVT::i8);
+        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+      }
+
+  return SDValue();
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.

@@ -23440,6 +23466,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
       return PerformINSERTPSCombine(N, DAG, Subtarget);
     break;
   }
+  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
   case ISD::BUILD_VECTOR:   return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
   }
 
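As a sanity check on the rewrite in PerformBLENDICombine above, here is a tiny standalone model (toy types and names, nothing from LLVM) of the identity it relies on: for a two-lane blend, swapping the operands and changing the mask from 2 to 1 selects exactly the same lanes, which is also why X86InstrInfo can safely commute the operands back after selection if register allocation prefers it.

// Toy model of a 2-lane blend; the names and types here are illustrative only.
#include <array>
#include <cassert>
#include <cstdint>

using V2 = std::array<double, 2>;

// Lane i comes from B when mask bit i is set, otherwise from A.
static V2 blend(const V2 &A, const V2 &B, uint8_t Mask) {
  V2 R;
  for (int I = 0; I < 2; ++I)
    R[I] = ((Mask >> I) & 1) ? B[I] : A[I];
  return R;
}

int main() {
  V2 X = {1.0, 2.0}, Y = {3.0, 4.0};
  // blend(X, Y, 2) == {X[0], Y[1]} == blend(Y, X, 1), so the commute is safe.
  assert(blend(X, Y, 2) == blend(Y, X, 1));
  return 0;
}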
lib/Target/X86/X86InstrSSE.td

@@ -3161,8 +3161,7 @@ let isCodeGenOnly = 1 in {
 // addss %xmm1, %xmm0
 
 // TODO: Some canonicalization in lowering would simplify the number of
-// patterns we have to try to match. In particular, the reversed order blends
-// seem unnecessary.
+// patterns we have to try to match.
 multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   let Predicates = [UseSSE1] in {
     // extracted scalar math op with insert via movss

@@ -3263,16 +3262,9 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 
-  // Repeat everything for AVX and add one more pattern
-  // (the scalar + blend reversed order) for good measure.
+  // Repeat everything for AVX.
   let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector

@@ -3288,13 +3280,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
               (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
                  (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-    // extracted scalar math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
-                       (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                            FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-                 (COPY_TO_REGCLASS FR64:$src, VR128))>;
 
     // vector math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),

@@ -3304,12 +3289,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 }
 