Mirror of https://github.com/c64scene-ar/llvm-6502.git (synced 2024-12-13 04:30:23 +00:00)
Handle vector move / load which zero the destination register top bits (i.e. movd, movq, movss (addr), movsd (addr)) with X86 specific dag combine.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@50838 91177308-0d34-0410-b5e6-96231b3b80d8
parent 687bcb2be0
commit 7e2ff77ef0
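For illustration, here is a minimal IR sketch of the pattern this combine targets; it simply mirrors the new vec_set-C.ll test added at the end of this diff, and the function name is invented. A scalar inserted into an all-zeros vector can now be selected as a single zero-extending movq/movd instead of a shuffle against a zero vector:

; Hypothetical example, same shape as the new vec_set-C.ll test below.
define <2 x i64> @zext_insert_low(i64 %x) nounwind {
  ; the upper element is known zero, so a single movq/movd can materialize %v
  %v = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
  ret <2 x i64> %v
}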
@@ -975,38 +975,19 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
   // Also handle the case where we explicitly require zeros in the top
   // elements. This is a vector shuffle from the zero vector.
-  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
-      // Check to see if the top elements are all zeros (or bitcast of zeros).
-      ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
-      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      N.getOperand(1).Val->hasOneUse() &&
-      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
-      N.getOperand(1).getOperand(0).hasOneUse()) {
-    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
-    // from the LHS.
-    unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
-    SDOperand ShufMask = N.getOperand(2);
-    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
-      if (C->getValue() == VecWidth) {
-        for (unsigned i = 1; i != VecWidth; ++i) {
-          if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
-            // ok.
-          } else {
-            ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
-            if (C->getValue() >= VecWidth) return false;
-          }
-        }
-      }
-
-      // Okay, this is a zero extending load. Fold it.
-      LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
-      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
-        return false;
-      OutChain = LD->getChain();
-      InChain = SDOperand(LD, 1);
-      return true;
-    }
+  if (N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).Val->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).Val) &&
+      N.getOperand(0).getOperand(0).hasOneUse()) {
+    // Okay, this is a zero extending load. Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+      return false;
+    OutChain = LD->getChain();
+    InChain = SDOperand(LD, 1);
+    return true;
   }
   return false;
 }
@@ -2605,11 +2605,16 @@ static bool ShouldXformToMOVHLPS(SDNode *Mask) {
 }

 /// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector.
-static inline bool isScalarLoadToVector(SDNode *N) {
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
     N = N->getOperand(0).Val;
-    return ISD::isNON_EXTLoad(N);
+    if (ISD::isNON_EXTLoad(N)) {
+      if (LD)
+        *LD = cast<LoadSDNode>(N);
+      return true;
+    }
   }
   return false;
 }
@@ -3082,8 +3087,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
     return SDOperand();

   // Let legalizer expand 2-wide build_vectors.
-  if (EVTBits == 64)
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDOperand V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, DAG);
+    }
     return SDOperand();
+  }

   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   if (EVTBits == 8 && NumElems == 16) {
@@ -3131,13 +3144,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
       }
     }

-    // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
-    // clears the upper bits.
-    // FIXME: we can do the same for v4f32 case when we know both parts of
-    // the lower half come from scalar_to_vector (loadf32). We should do
-    // that in post legalizer dag combiner with target specific hooks.
-    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
-      return V[0];
     MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
     MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
     SmallVector<SDOperand, 8> MaskVec;
@@ -3475,6 +3481,38 @@ SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
                                 &MaskVec[0], MaskVec.size()));
 }

+/// getZextVMoveL - Return a zero-extending vector move low node.
+///
+static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
+                               SDOperand SrcOp, SelectionDAG &DAG,
+                               const X86Subtarget *Subtarget) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    if (!isScalarLoadToVector(SrcOp.Val, &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT::ValueType EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BIT_CONVERT, VT,
+                           DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
+                                                   SrcOp.getOperand(0).getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                                 DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
+}
+
 SDOperand
 X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
   SDOperand V1 = Op.getOperand(0);
@@ -3515,27 +3553,33 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     // FIXME: Figure out a cleaner way to do this.
     // Try to make use of movq to zero out the top part.
     if (ISD::isBuildVectorAllZeros(V2.Val)) {
-      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                 DAG, *this);
       if (NewOp.Val) {
         SDOperand NewV1 = NewOp.getOperand(0);
         SDOperand NewV2 = NewOp.getOperand(1);
         SDOperand NewMask = NewOp.getOperand(2);
         if (isCommutedMOVL(NewMask.Val, true, false)) {
           NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
-          NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
-                              NewV1, NewV2, getMOVLMask(2, DAG));
-          return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+          return getZextVMoveL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
         }
       }
     } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
-      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                DAG, *this);
       if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
-        return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+        return getZextVMoveL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+                             DAG, Subtarget);
     }
   }

-  if (X86::isMOVLMask(PermMask.Val))
-    return (V1IsUndef) ? V2 : Op;
+  if (X86::isMOVLMask(PermMask.Val)) {
+    if (V1IsUndef)
+      return V2;
+    if (ISD::isBuildVectorAllZeros(V1.Val))
+      return getZextVMoveL(VT, VT, V2, DAG, Subtarget);
+    return Op;
+  }

   if (X86::isMOVSHDUPMask(PermMask.Val) ||
       X86::isMOVSLDUPMask(PermMask.Val) ||
@@ -5629,8 +5673,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
   case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
   case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
-  case X86ISD::LCMPXCHG_DAG: return "x86ISD::LCMPXCHG_DAG";
-  case X86ISD::LCMPXCHG8_DAG: return "x86ISD::LCMPXCHG8_DAG";
+  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::ZEXT_VMOVL: return "X86ISD::ZEXT_VMOVL";
   }
 }
@@ -6192,16 +6237,46 @@ static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
   return false;
 }

-static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
-                              const X86Subtarget *Subtarget) {
+static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, MachineFrameInfo *MFI,
+                               const X86Subtarget *Subtarget) {
   GlobalValue *GV;
   int64_t Offset = 0;
   if (isGAPlusOffset(Base, GV, Offset))
-    return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+    return (GV->getAlignment() >= N && (Offset % N) == 0);
   // DAG combine handles the stack object case.
   return false;
 }

+static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
+                                     unsigned NumElems, MVT::ValueType EVT,
+                                     MachineFrameInfo *MFI,
+                                     SelectionDAG &DAG, SDNode *&Base) {
+  Base = NULL;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      if (!Base)
+        return false;
+      continue;
+    }
+
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    SDOperand Elt = getShuffleScalarElt(N, Index, DAG);
+    if (!Elt.Val ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
+      return false;
+    if (!Base) {
+      Base = Elt.Val;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    if (!isConsecutiveLoad(Elt.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
+      return false;
+  }
+  return true;
+}
+
 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
@@ -6209,36 +6284,17 @@ static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
 /// order.
 static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MVT::ValueType VT = N->getValueType(0);
   MVT::ValueType EVT = MVT::getVectorElementType(VT);
   SDOperand PermMask = N->getOperand(2);
   unsigned NumElems = PermMask.getNumOperands();
   SDNode *Base = NULL;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    SDOperand Elt = PermMask.getOperand(i);
-    if (Elt.getOpcode() == ISD::UNDEF) {
-      if (!Base)
-        return SDOperand();
-      continue;
-    }
-
-    unsigned Idx = cast<ConstantSDNode>(Elt)->getValue();
-    SDOperand Arg = getShuffleScalarElt(N, Idx, DAG);
-    if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
-      return SDOperand();
-    if (!Base) {
-      Base = Arg.Val;
-      continue;
-    }
-
-    if (!isConsecutiveLoad(Arg.Val, Base, i, MVT::getSizeInBits(EVT)/8,MFI))
-      return SDOperand();
-  }
+  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, MFI, DAG, Base))
+    return SDOperand();

   LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget))
+  if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, MFI, Subtarget))
     return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
                        LD->getSrcValueOffset(), LD->isVolatile());
   return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
@@ -6319,12 +6375,13 @@ static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
 }

 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
-static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
+static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS. This qualifies as a quick hack.
+  StoreSDNode *St = cast<StoreSDNode>(N);
   if (MVT::isVector(St->getValue().getValueType()) &&
       MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
       isa<LoadSDNode>(St->getValue()) &&
@@ -6442,8 +6499,7 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
   case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
-  case ISD::STORE:
-    return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
+  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR: return PerformFORCombine(N, DAG);
   case X86ISD::FAND: return PerformFANDCombine(N, DAG);
@@ -181,10 +181,10 @@ namespace llvm {
       /// in order to obtain suitable precision.
       FRSQRT, FRCP,

-      // Thread Local Storage
+      // TLSADDR, THREAThread - Thread Local Storage.
       TLSADDR, THREAD_POINTER,

-      // Exception Handling helpers
+      // EH_RETURN - Exception Handling helpers.
       EH_RETURN,

       /// TC_RETURN - Tail call return.
@@ -194,12 +194,15 @@ namespace llvm {
       /// operand #3 optional in flag
       TC_RETURN,

-      // compare and swap
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
       LCMPXCHG_DAG,
       LCMPXCHG8_DAG,

-      // Store FP control world into i16 memory
-      FNSTCW16m
+      // FNSTCW16m - Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      // ZEXT_VMOVL - Vector move low and zero extend.
+      ZEXT_VMOVL
     };
   }
@@ -200,18 +200,14 @@ let AddedComplexity = 15 in
 // movd to MMX register zero-extends
 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                             "movd\t{$src, $dst|$dst, $src}",
-                            [(set VR64:$dst,
-                              (v2i32 (vector_shuffle immAllZerosV,
-                                      (v2i32 (scalar_to_vector GR32:$src)),
-                                      MMX_MOVL_shuffle_mask)))]>;
+                            [(set VR64:$dst,
+                              (v2i32 (X86zvmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
 let AddedComplexity = 20 in
 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                             "movd\t{$src, $dst|$dst, $src}",
-                            [(set VR64:$dst,
-                              (v2i32 (vector_shuffle immAllZerosV,
-                                      (v2i32 (scalar_to_vector
-                                              (loadi32 addr:$src))),
-                                      MMX_MOVL_shuffle_mask)))]>;
+                            [(set VR64:$dst,
+                              (v2i32 (X86zvmovl (v2i32
+                                      (scalar_to_vector (loadi32 addr:$src))))))]>;

 // Arithmetic Instructions

@@ -564,14 +560,10 @@ def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
-                    (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
-                    MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
-                    (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
-                    MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v8i8 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i16 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
 }

 // Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
@@ -47,6 +47,7 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW",
 def X86insrtps : SDNode<"X86ISD::INSERTPS",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+def X86zvmovl : SDNode<"X86ISD::ZEXT_VMOVL", SDTUnaryOp>;

 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
@@ -1007,10 +1008,11 @@ let neverHasSideEffects = 1 in
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                       "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
-                                         (v4f32 (scalar_to_vector (loadf32 addr:$src))),
-                                         MOVL_shuffle_mask)))]>;
+                      [(set VR128:$dst, (v4f32 (X86zvmovl (v4f32 (scalar_to_vector
+                                                           (loadf32 addr:$src))))))]>;

+def : Pat<(v4f32 (X86zvmovl (memopv4f32 addr:$src))),
+          (MOVZSS2PSrm addr:$src)>;

 //===----------------------------------------------------------------------===//
 // SSE2 Instructions
@@ -2264,51 +2266,36 @@ let AddedComplexity = 20 in
 def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "movsd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
-                        (v2f64 (vector_shuffle immAllZerosV_bc,
-                                (v2f64 (scalar_to_vector
-                                        (loadf64 addr:$src))),
-                                MOVL_shuffle_mask)))]>;
+                        (v2f64 (X86zvmovl (v2f64 (scalar_to_vector
+                                                  (loadf64 addr:$src))))))]>;

+def : Pat<(v2f64 (X86zvmovl (memopv2f64 addr:$src))),
+          (MOVZSD2PDrm addr:$src)>;

 // movd / movq to XMM register zero-extends
 let AddedComplexity = 15 in {
 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector GR32:$src)),
-                                 MOVL_shuffle_mask)))]>;
+                       [(set VR128:$dst, (v4i32 (X86zvmovl
+                                          (v4i32 (scalar_to_vector GR32:$src)))))]>;
 // This is X86-64 only.
 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst,
-                          (v2i64 (vector_shuffle immAllZerosV_bc,
-                                  (v2i64 (scalar_to_vector GR64:$src)),
-                                  MOVL_shuffle_mask)))]>;
+                        [(set VR128:$dst, (v2i64 (X86zvmovl
+                                           (v2i64 (scalar_to_vector GR64:$src)))))]>;
 }

-// Handle the v2f64 form of 'MOVZQI2PQIrr' for PR2108. FIXME: this would be
-// better written as a dag combine xform.
-let AddedComplexity = 15 in
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                  (v2f64 (scalar_to_vector
-                          (f64 (bitconvert GR64:$src)))),
-                  MOVL_shuffle_mask)),
-          (MOVZQI2PQIrr GR64:$src)>, Requires<[HasSSE2]>;
-

 let AddedComplexity = 20 in {
 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
-                                 MOVL_shuffle_mask)))]>;
+                         (v4i32 (X86zvmovl (v4i32 (scalar_to_vector
+                                            (loadi32 addr:$src))))))]>;
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
-                       (v2i64 (vector_shuffle immAllZerosV_bc,
-                               (v2i64 (scalar_to_vector (loadi64 addr:$src))),
-                               MOVL_shuffle_mask)))]>, XS,
+                       (v2i64 (X86zvmovl (v2i64 (scalar_to_vector
+                                          (loadi64 addr:$src))))))]>, XS,
                      Requires<[HasSSE2]>;
 }

@@ -2317,17 +2304,14 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                                  VR128:$src,
-                                                  MOVL_shuffle_mask)))]>,
+                        [(set VR128:$dst, (v2i64 (X86zvmovl (v2i64 VR128:$src))))]>,
                         XS, Requires<[HasSSE2]>;

 let AddedComplexity = 20 in
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                                  (memopv2i64 addr:$src),
-                                                  MOVL_shuffle_mask)))]>,
+                        [(set VR128:$dst, (v2i64 (X86zvmovl
+                                                  (memopv2i64 addr:$src))))]>,
                         XS, Requires<[HasSSE2]>;

 //===----------------------------------------------------------------------===//
@@ -2774,11 +2758,9 @@ let Predicates = [HasSSE2] in {
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+def : Pat<(v2f64 (X86zvmovl (v2f64 (scalar_to_vector FR64:$src)))),
           (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
-                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+def : Pat<(v4f32 (X86zvmovl (v4f32 (scalar_to_vector FR32:$src)))),
           (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
 }
@@ -1,8 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep movlhps %t | count 1
; RUN: grep unpcklps %t | count 1
; RUN: grep punpckldq %t | count 1
; RUN: grep movq %t | count 1
; RUN: grep movsd %t | count 1

define <4 x float> @test1(float %a, float %b) nounwind {
  %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0   ; <<4 x float>> [#uses=1]
@@ -1,5 +1,6 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep unpcklps %t | count 1
; RUN: grep movss %t | count 1
; RUN: grep movups %t | count 1
; RUN: grep shufps %t | count 1

define <4 x float> @test(float %a, float %b, float %c) nounwind {
test/CodeGen/X86/vec_set-C.ll (new file, 7 lines)
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep movd
+
+define <2 x i64> @t1(i64 %x) nounwind {
+  %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
+  ret <2 x i64> %tmp8
+}
test/CodeGen/X86/vec_set-D.ll (new file, 7 lines)
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+
+define <4 x i32> @t(i32 %x, i32 %y) nounwind {
+  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
+  %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
+  ret <4 x i32> %tmp2
+}