Slightly generalize the code that handles shuffles of consecutive loads

on x86 to handle more cases.  Fix a bug in said code that would cause it 
to read past the end of an object.  Rewrite the code in 
SelectionDAGLegalize::ExpandBUILD_VECTOR to be a bit more general. 
Remove PerformBuildVectorCombine, which is no longer necessary with 
these changes.  In addition to simplifying the code, with this change, 
we can now catch a few more cases of consecutive loads.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73012 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Eli Friedman 2009-06-07 06:52:44 +00:00
parent dcef849ab0
commit 7a5e55509b
5 changed files with 90 additions and 185 deletions

View File

@ -1785,48 +1785,41 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
/// support the operation, but do support the resultant vector type. /// support the operation, but do support the resultant vector type.
SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
unsigned NumElems = Node->getNumOperands(); unsigned NumElems = Node->getNumOperands();
SDValue SplatValue = Node->getOperand(0); SDValue Value1, Value2;
DebugLoc dl = Node->getDebugLoc(); DebugLoc dl = Node->getDebugLoc();
MVT VT = Node->getValueType(0); MVT VT = Node->getValueType(0);
MVT OpVT = SplatValue.getValueType(); MVT OpVT = Node->getOperand(0).getValueType();
MVT EltVT = VT.getVectorElementType(); MVT EltVT = VT.getVectorElementType();
// If the only non-undef value is the low element, turn this into a // If the only non-undef value is the low element, turn this into a
// SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X. // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
bool isOnlyLowElement = true; bool isOnlyLowElement = true;
bool MoreThanTwoValues = false;
// FIXME: it would be far nicer to change this into map<SDValue,uint64_t>
// and use a bitmask instead of a list of elements.
// FIXME: this doesn't treat <0, u, 0, u> for example, as a splat.
std::map<SDValue, std::vector<unsigned> > Values;
Values[SplatValue].push_back(0);
bool isConstant = true; bool isConstant = true;
if (!isa<ConstantFPSDNode>(SplatValue) && !isa<ConstantSDNode>(SplatValue) && for (unsigned i = 0; i < NumElems; ++i) {
SplatValue.getOpcode() != ISD::UNDEF)
isConstant = false;
for (unsigned i = 1; i < NumElems; ++i) {
SDValue V = Node->getOperand(i); SDValue V = Node->getOperand(i);
Values[V].push_back(i); if (V.getOpcode() == ISD::UNDEF)
if (V.getOpcode() != ISD::UNDEF) continue;
if (i > 0)
isOnlyLowElement = false; isOnlyLowElement = false;
if (SplatValue != V) if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
SplatValue = SDValue(0, 0);
// If this isn't a constant element or an undef, we can't use a constant
// pool load.
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V) &&
V.getOpcode() != ISD::UNDEF)
isConstant = false; isConstant = false;
if (!Value1.getNode()) {
Value1 = V;
} else if (!Value2.getNode()) {
if (V != Value1)
Value2 = V;
} else if (V != Value1 && V != Value2) {
MoreThanTwoValues = true;
}
} }
if (isOnlyLowElement) { if (!Value1.getNode())
// If the low element is an undef too, then this whole things is an undef. return DAG.getUNDEF(VT);
if (Node->getOperand(0).getOpcode() == ISD::UNDEF)
return DAG.getUNDEF(VT); if (isOnlyLowElement)
// Otherwise, turn this into a scalar_to_vector node.
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
}
// If all elements are constants, create a load from the constant pool. // If all elements are constants, create a load from the constant pool.
if (isConstant) { if (isConstant) {
@ -1852,59 +1845,25 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
false, Alignment); false, Alignment);
} }
if (SplatValue.getNode()) { // Splat of one value? if (!MoreThanTwoValues) {
// Build the shuffle constant vector: <0, 0, 0, 0> SmallVector<int, 8> ShuffleVec(NumElems, -1);
SmallVector<int, 8> ZeroVec(NumElems, 0); for (unsigned i = 0; i < NumElems; ++i) {
SDValue V = Node->getOperand(i);
// If the target supports VECTOR_SHUFFLE and this shuffle mask, use it. if (V.getOpcode() == ISD::UNDEF)
if (TLI.isShuffleMaskLegal(ZeroVec, Node->getValueType(0))) { continue;
ShuffleVec[i] = V == Value1 ? 0 : NumElems;
}
if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
// Get the splatted value into the low element of a vector register. // Get the splatted value into the low element of a vector register.
SDValue LowValVec = SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, SplatValue); SDValue Vec2;
if (Value2.getNode())
Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
else
Vec2 = DAG.getUNDEF(VT);
// Return shuffle(LowValVec, undef, <0,0,0,0>) // Return shuffle(LowValVec, undef, <0,0,0,0>)
return DAG.getVectorShuffle(VT, dl, LowValVec, DAG.getUNDEF(VT), return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
&ZeroVec[0]);
}
}
// If there are only two unique elements, we may be able to turn this into a
// vector shuffle.
if (Values.size() == 2) {
// Get the two values in deterministic order.
SDValue Val1 = Node->getOperand(1);
SDValue Val2;
std::map<SDValue, std::vector<unsigned> >::iterator MI = Values.begin();
if (MI->first != Val1)
Val2 = MI->first;
else
Val2 = (++MI)->first;
// If Val1 is an undef, make sure it ends up as Val2, to ensure that our
// vector shuffle has the undef vector on the RHS.
if (Val1.getOpcode() == ISD::UNDEF)
std::swap(Val1, Val2);
// Build the shuffle constant vector: e.g. <0, 4, 0, 4>
SmallVector<int, 8> ShuffleMask(NumElems, -1);
// Set elements of the shuffle mask for Val1.
std::vector<unsigned> &Val1Elts = Values[Val1];
for (unsigned i = 0, e = Val1Elts.size(); i != e; ++i)
ShuffleMask[Val1Elts[i]] = 0;
// Set elements of the shuffle mask for Val2.
std::vector<unsigned> &Val2Elts = Values[Val2];
for (unsigned i = 0, e = Val2Elts.size(); i != e; ++i)
if (Val2.getOpcode() != ISD::UNDEF)
ShuffleMask[Val2Elts[i]] = NumElems;
// If the target supports SCALAR_TO_VECTOR and this shuffle mask, use it.
if (TLI.isOperationLegalOrCustom(ISD::SCALAR_TO_VECTOR, VT) &&
TLI.isShuffleMaskLegal(ShuffleMask, VT)) {
Val1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val1);
Val2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val2);
return DAG.getVectorShuffle(VT, dl, Val1, Val2, &ShuffleMask[0]);
} }
} }

View File

@ -7691,13 +7691,15 @@ static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
} }
static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
MVT EVT, SDNode *&Base, MVT EVT, LoadSDNode *&LDBase,
unsigned &LastLoadedElt,
SelectionDAG &DAG, MachineFrameInfo *MFI, SelectionDAG &DAG, MachineFrameInfo *MFI,
const TargetLowering &TLI) { const TargetLowering &TLI) {
Base = NULL; LDBase = NULL;
LastLoadedElt = -1;
for (unsigned i = 0; i < NumElems; ++i) { for (unsigned i = 0; i < NumElems; ++i) {
if (N->getMaskElt(i) < 0) { if (N->getMaskElt(i) < 0) {
if (!Base) if (!LDBase)
return false; return false;
continue; continue;
} }
@ -7706,19 +7708,20 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
if (!Elt.getNode() || if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return false; return false;
if (!Base) { if (!LDBase) {
Base = Elt.getNode(); if (Elt.getNode()->getOpcode() == ISD::UNDEF)
if (Base->getOpcode() == ISD::UNDEF)
return false; return false;
LDBase = cast<LoadSDNode>(Elt.getNode());
LastLoadedElt = i;
continue; continue;
} }
if (Elt.getOpcode() == ISD::UNDEF) if (Elt.getOpcode() == ISD::UNDEF)
continue; continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt); LoadSDNode *LD = cast<LoadSDNode>(Elt);
LoadSDNode *LDBase = cast<LoadSDNode>(Base);
if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI)) if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
return false; return false;
LastLoadedElt = i;
} }
return true; return true;
} }
@ -7737,6 +7740,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
unsigned NumElems = VT.getVectorNumElements(); unsigned NumElems = VT.getVectorNumElements();
if (VT.getSizeInBits() != 128)
return SDValue();
// For x86-32 machines, if we see an insert and then a shuffle in a v2i64 // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
// where the upper half is 0, it is advantageous to rewrite it as a build // where the upper half is 0, it is advantageous to rewrite it as a build
// vector of (0, val) so it can use movq. // vector of (0, val) so it can use movq.
@ -7764,107 +7770,24 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
// Try to combine a vector_shuffle into a 128-bit load. // Try to combine a vector_shuffle into a 128-bit load.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDNode *Base = NULL; LoadSDNode *LD = NULL;
if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI)) unsigned LastLoadedElt;
if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
MFI, TLI))
return SDValue(); return SDValue();
LoadSDNode *LD = cast<LoadSDNode>(Base); if (LastLoadedElt == NumElems - 1) {
if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI)) if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
LD->getSrcValue(), LD->getSrcValueOffset(),
LD->isVolatile());
return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
LD->getSrcValue(), LD->getSrcValueOffset(), LD->getSrcValue(), LD->getSrcValueOffset(),
LD->isVolatile()); LD->isVolatile(), LD->getAlignment());
return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), } else if (NumElems == 4 && LastLoadedElt == 1) {
LD->getSrcValue(), LD->getSrcValueOffset(), SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
LD->isVolatile(), LD->getAlignment());
}
/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
unsigned NumOps = N->getNumOperands();
DebugLoc dl = N->getDebugLoc();
// Ignore single operand BUILD_VECTOR.
if (NumOps == 1)
return SDValue();
MVT VT = N->getValueType(0);
MVT EVT = VT.getVectorElementType();
// Before or during type legalization, we want to try and convert a
// build_vector of an i64 load and a zero value into vzext_movl before the
// legalizer can break it up.
// FIXME: does the case below remove the need to do this?
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) {
if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
return SDValue();
// This must be an insertion into a zero vector.
SDValue HighElt = N->getOperand(1);
if (!isZeroNode(HighElt))
return SDValue();
// Value must be a load.
SDNode *Base = N->getOperand(0).getNode();
if (!isa<LoadSDNode>(Base)) {
if (Base->getOpcode() != ISD::BIT_CONVERT)
return SDValue();
Base = Base->getOperand(0).getNode();
if (!isa<LoadSDNode>(Base))
return SDValue();
}
// Transform it into VZEXT_LOAD addr.
LoadSDNode *LD = cast<LoadSDNode>(Base);
// Load must not be an extload.
if (LD->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
// Load type should legal type so we don't have to legalize it.
if (!TLI.isTypeLegal(VT))
return SDValue();
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
TargetLowering::TargetLoweringOpt TLO(DAG);
TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
DCI.CommitTargetLoweringOpt(TLO);
return ResNode;
}
// The type legalizer will have broken apart v2i64 build_vector created during
// widening before the code which handles that case is run. Look for build
// vector (load, load + 4, 0/undef, 0/undef)
if (VT == MVT::v4i32 || VT == MVT::v4f32) {
LoadSDNode *LD0 = dyn_cast<LoadSDNode>(N->getOperand(0));
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(N->getOperand(1));
if (!LD0 || !LD1)
return SDValue();
if (LD0->getExtensionType() != ISD::NON_EXTLOAD ||
LD1->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
// Make sure the second elt is a consecutive load.
if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1,
DAG.getMachineFunction().getFrameInfo()))
return SDValue();
SDValue N2 = N->getOperand(2);
SDValue N3 = N->getOperand(3);
if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF)
return SDValue();
if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF)
return SDValue();
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() };
SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
TargetLowering::TargetLoweringOpt TLO(DAG);
TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1));
DCI.CommitTargetLoweringOpt(TLO);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
} }
return SDValue(); return SDValue();
@ -8466,14 +8389,25 @@ static SDValue PerformBTCombine(SDNode *N,
return SDValue(); return SDValue();
} }
/// PerformVZEXT_MOVLCombine - Fold a node whose (possibly bit-converted)
/// operand is an X86ISD::VZEXT_LOAD into a plain BIT_CONVERT of that load.
/// Returns the replacement value, or an empty SDValue when the pattern does
/// not match.
/// NOTE(review): presumably valid because VZEXT_LOAD already zeroes the
/// upper elements, making the outer zero-extending move redundant -- confirm
/// against the X86ISD node definitions.
static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Op = N->getOperand(0);
// Look through a single BIT_CONVERT to find the underlying producer.
if (Op.getOpcode() == ISD::BIT_CONVERT)
Op = Op.getOperand(0);
MVT VT = N->getValueType(0), OpVT = Op.getValueType();
// Only fold when the result and the load use the same element width.
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
// Re-type the load's value directly to the result type of N.
return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
}
// No match: an empty SDValue tells the caller nothing was combined.
return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const { DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG; SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) { switch (N->getOpcode()) {
default: break; default: break;
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
case ISD::BUILD_VECTOR:
return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
@ -8485,6 +8419,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
} }
return SDValue(); return SDValue();

View File

@ -0,0 +1,12 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
; Loads two adjacent floats (%p[0] and %p[1]) and inserts them into lanes 0
; and 1 of an otherwise-undef <4 x float>. The RUN line above requires that
; the generated x86/SSE2 code contain a movq.
define <4 x float> @a(<4 x float> %a, float* nocapture %p) nounwind readonly {
entry:
; Lane 0 comes from %p[0].
%tmp1 = load float* %p
%vecins = insertelement <4 x float> undef, float %tmp1, i32 0
; Lane 1 comes from the next consecutive float, %p[1].
%add.ptr = getelementptr float* %p, i32 1
%tmp5 = load float* %add.ptr
%vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
; Lanes 2 and 3 are left undef.
ret <4 x float> %vecins7
}

View File

@ -1,7 +1,6 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep movlhps %t | count 1 ; RUN: grep movlhps %t | count 1
; RUN: grep movq %t | count 1 ; RUN: grep movq %t | count 2
; RUN: grep movsd %t | count 1
define <4 x float> @test1(float %a, float %b) nounwind { define <4 x float> @test1(float %a, float %b) nounwind {
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]

View File

@ -1,6 +1,6 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
; RUN: grep movss %t | count 1 ; RUN: grep movss %t | count 1
; RUN: grep movups %t | count 1 ; RUN: grep movq %t | count 1
; RUN: grep shufps %t | count 1 ; RUN: grep shufps %t | count 1
define <4 x float> @test(float %a, float %b, float %c) nounwind { define <4 x float> @test(float %a, float %b, float %c) nounwind {