diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index da20cefdfbf..7acfc0e96f0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -23024,6 +23024,32 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                      LoadScalarToVector, N->getOperand(2));
 }
 
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue V0 = N->getOperand(0);
+  SDValue V1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+  // operands and changing the mask to 1. This saves us a bunch of
+  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+  // x86InstrInfo knows how to commute this back after instruction selection
+  // if it would help register allocation.
+
+  // TODO: If optimizing for size or a processor that doesn't suffer from
+  // partial register update stalls, this should be transformed into a MOVSD
+  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+  if (VT == MVT::v2f64)
+    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+        SDValue NewMask = DAG.getConstant(1, MVT::i8);
+        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+      }
+
+  return SDValue();
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -23440,6 +23466,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
       return PerformINSERTPSCombine(N, DAG, Subtarget);
     break;
   }
+  case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
   }
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 079487d19b4..c64d35cf71f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3161,8 +3161,7 @@ let isCodeGenOnly = 1 in {
 //   addss %xmm1, %xmm0
 
 // TODO: Some canonicalization in lowering would simplify the number of
-// patterns we have to try to match. In particular, the reversed order blends
-// seem unnecessary.
+// patterns we have to try to match.
 multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   let Predicates = [UseSSE1] in {
     // extracted scalar math op with insert via movss
@@ -3263,16 +3262,9 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                   (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                      (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                      (v2f64 VR128:$dst), (i8 2))),
-              (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 
-  // Repeat everything for AVX and add one more pattern
-  // (the scalar + blend reversed order) for good measure.
+  // Repeat everything for AVX.
   let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
@@ -3288,13 +3280,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
       (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
           (COPY_TO_REGCLASS FR64:$src, VR128))>;
 
-    // extracted scalar math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
-          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-          FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-          (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
     // vector math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
           (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
@@ -3304,12 +3289,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                   (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                      (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                      (v2f64 VR128:$dst), (i8 2))),
-              (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 }