diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3a861c041d1..b7307ea1e95 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2593,7 +2593,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::SHUFPD: case X86ISD::SHUFPS: case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVSS: @@ -2648,6 +2651,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::PUNPCKLDQ: @@ -3664,7 +3669,6 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG) { SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); - int NumElems = VT.getVectorNumElements(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { @@ -3673,6 +3677,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG) { if (Index < 0) return DAG.getUNDEF(VT.getVectorElementType()); + int NumElems = VT.getVectorNumElements(); SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG); } @@ -3698,8 +3703,9 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG) { if (Opcode == ISD::BIT_CONVERT) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); - if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != (unsigned)NumElems) + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } @@ -5061,6 +5067,67 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } +static +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second + // operand of these instructions is only memory, so check if there's a + // potencial load folding here, otherwise use SHUFPS or MOVSD to match the + // same masks. + bool CanFoldLoad = false; + SDValue TmpV1 = V1; + SDValue TmpV2 = V2; + + // Trivial case, when V2 is a load. + if (TmpV2.getOpcode() == ISD::BIT_CONVERT) + TmpV2 = TmpV2.getOperand(0); + if (TmpV2.getOpcode() == ISD::SCALAR_TO_VECTOR) + TmpV2 = TmpV2.getOperand(0); + if (MayFoldLoad(TmpV2)) + CanFoldLoad = true; + + // When V1 is a load, it can be folded later into a store in isel, example: + // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) + // turns into: + // (MOVLPSmr addr:$src1, VR128:$src2) + // So, recognize this potential and also use MOVLPS or MOVLPD + if (TmpV1.getOpcode() == ISD::BIT_CONVERT) + TmpV1 = TmpV1.getOperand(0); + if (MayFoldLoad(TmpV1)) + CanFoldLoad = true; + + if (CanFoldLoad) { + if (HasSSE2 && NumElems == 2) + return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); + + if (NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + } + + ShuffleVectorSDNode *SVOp = cast(Op); + // movl and movlp will both match v2i64, but v2i64 is never matched by + // movl earlier because we make it strict to avoid messing with the movlp load + // folding logic (see the code above getMOVLP call). Match it here then, + // this is horrible, but will stay like this until we move all shuffle + // matching to x86 specific nodes. Note that for the 1st condition all + // types are matched with movsd. + if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + else if (HasSSE2) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + + + assert(VT != MVT::v4i32 && "unsupported shuffle type"); + + // Invert the operand order and use SHUFPS to match it. + return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, + X86::getShuffleSHUFImmediate(SVOp), DAG); +} + SDValue X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast(Op); @@ -5182,7 +5249,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); if (X86::isMOVLPMask(SVOp)) - return Op; + return getMOVLP(Op, dl, DAG, HasSSE2); } if (ShouldXformToMOVHLPS(SVOp) || @@ -8433,6 +8500,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7cd9a157a3b..d2d9b28a039 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -269,6 +269,8 @@ namespace llvm { MOVLHPD, MOVHLPS, MOVHLPD, + MOVLPS, + MOVLPD, MOVSD, MOVSS, UNPCKLPS, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 7f9a39707d2..01149b69921 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -134,9 +134,6 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDTShuff2OpLd : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisPtrTy<2>]>; - def SDTShuff2OpLdI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; @@ -161,14 +158,11 @@ def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; - def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; -def X86MovlpsLd : SDNode<"X86ISD::MOVLPS", SDTShuff2OpLd, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86MovlpdLd : SDNode<"X86ISD::MOVLPD", SDTShuff2OpLd, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index bc6278a2b49..8a1786272fd 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5840,9 +5840,9 @@ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), (MOVSDrr (v2f64 VR128:$src1), (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; // Shuffle with MOVSHDUP def : Pat<(v4i32 (X86Movshdup VR128:$src)), @@ -5901,7 +5901,25 @@ def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -// Extra patterns to match stores +// Shuffle with MOVLPS +def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + +// Shuffle with MOVLPD +def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86Movlpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + +// Extra patterns to match stores with MOVHPS/PD and MOVLPS/PD def : Pat<(store (f64 (vector_extract (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))),addr:$dst), (MOVHPSmr addr:$dst, VR128:$src)>; @@ -5909,3 +5927,13 @@ def : Pat<(store (f64 (vector_extract (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; +def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; +def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + +def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>;