From 7e2ff77ef05c23db6b9c82bc7a4110e170d7f94c Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Thu, 8 May 2008 00:57:18 +0000
Subject: [PATCH] Handle vector move / load which zero the destination register
 top bits (i.e. movd, movq, movss (addr), movsd (addr)) with X86 specific dag
 combine.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@50838 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp |  43 +++-----
 lib/Target/X86/X86ISelLowering.cpp | 156 ++++++++++++++++++++---------
 lib/Target/X86/X86ISelLowering.h   |  13 ++-
 lib/Target/X86/X86InstrMMX.td      |  26 ++---
 lib/Target/X86/X86InstrSSE.td      |  64 +++++-------
 test/CodeGen/X86/vec_set-5.ll      |   3 +-
 test/CodeGen/X86/vec_set-6.ll      |   3 +-
 test/CodeGen/X86/vec_set-C.ll      |   7 ++
 test/CodeGen/X86/vec_set-D.ll      |   7 ++
 9 files changed, 175 insertions(+), 147 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_set-C.ll
 create mode 100644 test/CodeGen/X86/vec_set-D.ll

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8286cd05a03..1e9a0da9c68 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -975,38 +975,19 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
 
   // Also handle the case where we explicitly require zeros in the top
   // elements. This is a vector shuffle from the zero vector.
-  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
+  if (N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
       // Check to see if the top elements are all zeros (or bitcast of zeros).
-      ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
-      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      N.getOperand(1).Val->hasOneUse() &&
-      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
-      N.getOperand(1).getOperand(0).hasOneUse()) {
-    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
-    // from the LHS.
-    unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
-    SDOperand ShufMask = N.getOperand(2);
-    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
-      if (C->getValue() == VecWidth) {
-        for (unsigned i = 1; i != VecWidth; ++i) {
-          if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
-            // ok.
-          } else {
-            ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
-            if (C->getValue() >= VecWidth) return false;
-          }
-        }
-      }
-
-      // Okay, this is a zero extending load. Fold it.
-      LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
-      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
-        return false;
-      OutChain = LD->getChain();
-      InChain = SDOperand(LD, 1);
-      return true;
-    }
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).Val->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).Val) &&
+      N.getOperand(0).getOperand(0).hasOneUse()) {
+    // Okay, this is a zero extending load. Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+      return false;
+    OutChain = LD->getChain();
+    InChain = SDOperand(LD, 1);
+    return true;
   }
   return false;
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4210ec9d8f8..d2441fc1372 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2605,11 +2605,16 @@ static bool ShouldXformToMOVHLPS(SDNode *Mask) {
 }
 
 /// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector.
-static inline bool isScalarLoadToVector(SDNode *N) {
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
     N = N->getOperand(0).Val;
-    return ISD::isNON_EXTLoad(N);
+    if (ISD::isNON_EXTLoad(N)) {
+      if (LD)
+        *LD = cast<LoadSDNode>(N);
+      return true;
+    }
   }
   return false;
 }
@@ -3082,8 +3087,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
     return SDOperand();
 
   // Let legalizer expand 2-wide build_vectors.
-  if (EVTBits == 64)
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDOperand V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, DAG);
+    }
     return SDOperand();
+  }
 
   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   if (EVTBits == 8 && NumElems == 16) {
@@ -3131,13 +3144,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
     }
   }
 
-  // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
-  // clears the upper bits.
-  // FIXME: we can do the same for v4f32 case when we know both parts of
-  // the lower half come from scalar_to_vector (loadf32). We should do
-  // that in post legalizer dag combiner with target specific hooks.
-  if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
-    return V[0];
   MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
   MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
   SmallVector MaskVec;
@@ -3475,6 +3481,38 @@ SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
                                 &MaskVec[0], MaskVec.size()));
 }
 
+/// getZextVMoveL - Return a zero-extending vector move low node.
+///
+static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
+                               SDOperand SrcOp, SelectionDAG &DAG,
+                               const X86Subtarget *Subtarget) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    if (!isScalarLoadToVector(SrcOp.Val, &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT::ValueType EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BIT_CONVERT, VT,
+                           DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
+                                                   SrcOp.getOperand(0).getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                                 DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
+}
+
 SDOperand
 X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
   SDOperand V1 = Op.getOperand(0);
@@ -3515,27 +3553,33 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     // FIXME: Figure out a cleaner way to do this.
     // Try to make use of movq to zero out the top part.
     if (ISD::isBuildVectorAllZeros(V2.Val)) {
-      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                 DAG, *this);
       if (NewOp.Val) {
         SDOperand NewV1 = NewOp.getOperand(0);
         SDOperand NewV2 = NewOp.getOperand(1);
         SDOperand NewMask = NewOp.getOperand(2);
         if (isCommutedMOVL(NewMask.Val, true, false)) {
           NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
-          NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
-                              NewV1, NewV2, getMOVLMask(2, DAG));
-          return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+          return getZextVMoveL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
         }
       }
     } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
-      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                DAG, *this);
       if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
-        return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+        return getZextVMoveL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+                             DAG, Subtarget);
     }
   }
 
-  if (X86::isMOVLMask(PermMask.Val))
-    return (V1IsUndef) ? V2 : Op;
+  if (X86::isMOVLMask(PermMask.Val)) {
+    if (V1IsUndef)
+      return V2;
+    if (ISD::isBuildVectorAllZeros(V1.Val))
+      return getZextVMoveL(VT, VT, V2, DAG, Subtarget);
+    return Op;
+  }
 
   if (X86::isMOVSHDUPMask(PermMask.Val) ||
       X86::isMOVSLDUPMask(PermMask.Val) ||
@@ -5629,8 +5673,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
-  case X86ISD::LCMPXCHG_DAG:       return "x86ISD::LCMPXCHG_DAG";
-  case X86ISD::LCMPXCHG8_DAG:      return "x86ISD::LCMPXCHG8_DAG";
+  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::ZEXT_VMOVL:         return "X86ISD::ZEXT_VMOVL";
   }
 }
 
@@ -6192,16 +6237,46 @@ static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
   return false;
 }
 
-static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
-                              const X86Subtarget *Subtarget) {
+static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, MachineFrameInfo *MFI,
+                               const X86Subtarget *Subtarget) {
   GlobalValue *GV;
   int64_t Offset = 0;
   if (isGAPlusOffset(Base, GV, Offset))
-    return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+    return (GV->getAlignment() >= N && (Offset % N) == 0);
   // DAG combine handles the stack object case.
   return false;
 }
 
+static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
+                                     unsigned NumElems, MVT::ValueType EVT,
+                                     MachineFrameInfo *MFI,
+                                     SelectionDAG &DAG, SDNode *&Base) {
+  Base = NULL;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      if (!Base)
+        return false;
+      continue;
+    }
+
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    SDOperand Elt = getShuffleScalarElt(N, Index, DAG);
+    if (!Elt.Val ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
+      return false;
+    if (!Base) {
+      Base = Elt.Val;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    if (!isConsecutiveLoad(Elt.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
+      return false;
+  }
+  return true;
+}
 
 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
@@ -6209,36 +6284,17 @@ static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
 /// order.
 static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MVT::ValueType VT = N->getValueType(0);
   MVT::ValueType EVT = MVT::getVectorElementType(VT);
   SDOperand PermMask = N->getOperand(2);
   unsigned NumElems = PermMask.getNumOperands();
   SDNode *Base = NULL;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    SDOperand Elt = PermMask.getOperand(i);
-    if (Elt.getOpcode() == ISD::UNDEF) {
-      if (!Base)
-        return SDOperand();
-      continue;
-    }
-
-    unsigned Idx = cast<ConstantSDNode>(Elt)->getValue();
-    SDOperand Arg = getShuffleScalarElt(N, Idx, DAG);
-    if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
-      return SDOperand();
-    if (!Base) {
-      Base = Arg.Val;
-      continue;
-    }
-
-    if (!isConsecutiveLoad(Arg.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
-      return SDOperand();
-  }
+  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, MFI, DAG, Base))
+    return SDOperand();
 
   LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget))
+  if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, MFI, Subtarget))
     return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
                        LD->getSrcValueOffset(), LD->isVolatile());
   return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
@@ -6319,12 +6375,13 @@ static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
 }
 
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
-static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
+static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS. This qualifies as a quick hack.
+  StoreSDNode *St = cast<StoreSDNode>(N);
   if (MVT::isVector(St->getValue().getValueType()) &&
       MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
       isa<LoadSDNode>(St->getValue()) &&
@@ -6442,8 +6499,7 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
-  case ISD::STORE:
-      return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
+  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 287903913ed..a6556b7695a 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -181,10 +181,10 @@ namespace llvm {
       /// in order to obtain suitable precision.
       FRSQRT, FRCP,
 
-      // Thread Local Storage
+      // TLSADDR, THREAD_POINTER - Thread Local Storage.
       TLSADDR, THREAD_POINTER,
 
-      // Exception Handling helpers
+      // EH_RETURN - Exception Handling helpers.
      EH_RETURN,
 
       /// TC_RETURN - Tail call return.
@@ -194,12 +194,15 @@ namespace llvm {
       ///   operand #3 optional in flag
       TC_RETURN,
 
-      // compare and swap
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
       LCMPXCHG_DAG,
       LCMPXCHG8_DAG,
 
-      // Store FP control world into i16 memory
-      FNSTCW16m
+      // FNSTCW16m - Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      // ZEXT_VMOVL - Vector move low and zero extend.
+      ZEXT_VMOVL
     };
   }
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index d484695f60e..3c1fc750a14 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -200,18 +200,14 @@ let AddedComplexity = 15 in
 // movd to MMX register zero-extends
 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                             "movd\t{$src, $dst|$dst, $src}",
-                            [(set VR64:$dst,
-                              (v2i32 (vector_shuffle immAllZerosV,
-                                      (v2i32 (scalar_to_vector GR32:$src)),
-                                      MMX_MOVL_shuffle_mask)))]>;
+                            [(set VR64:$dst,
+                              (v2i32 (X86zvmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
 let AddedComplexity = 20 in
 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                             "movd\t{$src, $dst|$dst, $src}",
-                            [(set VR64:$dst,
-                              (v2i32 (vector_shuffle immAllZerosV,
-                                      (v2i32 (scalar_to_vector
-                                              (loadi32 addr:$src))),
-                                      MMX_MOVL_shuffle_mask)))]>;
+                            [(set VR64:$dst,
+                              (v2i32 (X86zvmovl (v2i32
+                                     (scalar_to_vector (loadi32 addr:$src))))))]>;
 
 
 // Arithmetic Instructions
@@ -564,14 +560,10 @@ def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
-                   (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
-                   MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
-                    (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
-                    MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v8i8 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i16 (X86zvmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
 }
 
 // Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9c07de6fd03..375d0add62a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -47,6 +47,7 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW",
 def X86insrtps : SDNode<"X86ISD::INSERTPS",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+def X86zvmovl : SDNode<"X86ISD::ZEXT_VMOVL", SDTUnaryOp>;
 
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
@@ -1007,10 +1008,11 @@ let neverHasSideEffects = 1 in
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                       "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
-                                                (v4f32 (scalar_to_vector (loadf32 addr:$src))),
-                                                MOVL_shuffle_mask)))]>;
+                      [(set VR128:$dst, (v4f32 (X86zvmovl (v4f32 (scalar_to_vector
+                                                           (loadf32 addr:$src))))))]>;
+def : Pat<(v4f32 (X86zvmovl (memopv4f32 addr:$src))),
+          (MOVZSS2PSrm addr:$src)>;
 
 //===----------------------------------------------------------------------===//
 // SSE2 Instructions
@@ -2264,51 +2266,36 @@ let AddedComplexity = 20 in
 def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "movsd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
-                        (v2f64 (vector_shuffle immAllZerosV_bc,
-                                (v2f64 (scalar_to_vector
-                                        (loadf64 addr:$src))),
-                                MOVL_shuffle_mask)))]>;
+                        (v2f64 (X86zvmovl (v2f64 (scalar_to_vector
+                                                  (loadf64 addr:$src))))))]>;
+
+def : Pat<(v2f64 (X86zvmovl (memopv2f64 addr:$src))),
+          (MOVZSD2PDrm addr:$src)>;
 
 // movd / movq to XMM register zero-extends
 let AddedComplexity = 15 in {
 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector GR32:$src)),
-                                 MOVL_shuffle_mask)))]>;
+                       [(set VR128:$dst, (v4i32 (X86zvmovl
+                                          (v4i32 (scalar_to_vector GR32:$src)))))]>;
 // This is X86-64 only.
 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst,
-                          (v2i64 (vector_shuffle immAllZerosV_bc,
-                                  (v2i64 (scalar_to_vector GR64:$src)),
-                                  MOVL_shuffle_mask)))]>;
+                        [(set VR128:$dst, (v2i64 (X86zvmovl
+                                           (v2i64 (scalar_to_vector GR64:$src)))))]>;
 }
 
-// Handle the v2f64 form of 'MOVZQI2PQIrr' for PR2108. FIXME: this would be
-// better written as a dag combine xform.
-let AddedComplexity = 15 in
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                  (v2f64 (scalar_to_vector
-                          (f64 (bitconvert GR64:$src)))),
-                  MOVL_shuffle_mask)),
-          (MOVZQI2PQIrr GR64:$src)>, Requires<[HasSSE2]>;
-
-
 let AddedComplexity = 20 in {
 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
-                                 MOVL_shuffle_mask)))]>;
+                         (v4i32 (X86zvmovl (v4i32 (scalar_to_vector
+                                            (loadi32 addr:$src))))))]>;
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
-                       (v2i64 (vector_shuffle immAllZerosV_bc,
-                               (v2i64 (scalar_to_vector (loadi64 addr:$src))),
-                               MOVL_shuffle_mask)))]>, XS,
+                       (v2i64 (X86zvmovl (v2i64 (scalar_to_vector
+                                          (loadi64 addr:$src))))))]>, XS,
                      Requires<[HasSSE2]>;
 }
 
@@ -2317,17 +2304,14 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                                  VR128:$src,
-                                                  MOVL_shuffle_mask)))]>,
+                        [(set VR128:$dst, (v2i64 (X86zvmovl (v2i64 VR128:$src))))]>,
                         XS, Requires<[HasSSE2]>;
 
 let AddedComplexity = 20 in
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                                  (memopv2i64 addr:$src),
-                                                  MOVL_shuffle_mask)))]>,
+                        [(set VR128:$dst, (v2i64 (X86zvmovl
+                                                  (memopv2i64 addr:$src))))]>,
                         XS, Requires<[HasSSE2]>;
 
 //===----------------------------------------------------------------------===//
@@ -2774,11 +2758,9 @@ let Predicates = [HasSSE2] in {
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+def : Pat<(v2f64 (X86zvmovl (v2f64 (scalar_to_vector FR64:$src)))),
           (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
-                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+def : Pat<(v4f32 (X86zvmovl (v4f32 (scalar_to_vector FR32:$src)))),
           (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
 }
 
diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
index 687d6afca4f..4fc652c022a 100644
--- a/test/CodeGen/X86/vec_set-5.ll
+++ b/test/CodeGen/X86/vec_set-5.ll
@@ -1,8 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep movlhps %t | count 1
-; RUN: grep unpcklps %t | count 1
-; RUN: grep punpckldq %t | count 1
 ; RUN: grep movq %t | count 1
+; RUN: grep movsd %t | count 1
 
 define <4 x float> @test1(float %a, float %b) nounwind {
 	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0	; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 1eeedf184dd..02df526cee8 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
-; RUN: grep unpcklps %t | count 1
+; RUN: grep movss %t | count 1
+; RUN: grep movups %t | count 1
 ; RUN: grep shufps %t | count 1
 
 define <4 x float> @test(float %a, float %b, float %c) nounwind {
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
new file mode 100644
index 00000000000..eef9a61ab94
--- /dev/null
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep movd
+
+define <2 x i64> @t1(i64 %x) nounwind {
+	%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
+	ret <2 x i64> %tmp8
+}
diff --git a/test/CodeGen/X86/vec_set-D.ll b/test/CodeGen/X86/vec_set-D.ll
new file mode 100644
index 00000000000..71bdd849953
--- /dev/null
+++ b/test/CodeGen/X86/vec_set-D.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+
+define <4 x i32> @t(i32 %x, i32 %y) nounwind {
+	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
+	%tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
+	ret <4 x i32> %tmp2
+}