BUILD_VECTOR was missing out on some prime opportunities to use SSE 4.1 inserts.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@99423 91177308-0d34-0410-b5e6-96231b3b80d8
commit fdea31a463
parent d6a6b3b756
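For context on the diff below: with SSE 4.1 available, a 128-bit BUILD_VECTOR assembled from a handful of scalar values can be lowered with element inserts (pinsrd and friends) instead of the generic unpack expansion, and a BUILD_VECTOR whose elements are consecutive loads can become a single vector load. A minimal IR sketch of the insert case (function name and arguments are illustrative only; the new test vec_insert-9.ll at the bottom of this commit checks a similar pattern):

; Sketch only: assumes -march=x86 -mattr=+sse41, mirroring the new test below.
define <4 x i32> @build_from_scalars(i32 %a, i32 %b, i32 %c) nounwind {
entry:
  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
  ret <4 x i32> %v2
}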
@@ -3613,6 +3613,54 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
   return SDValue();
 }
 
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        DebugLoc &dl, SelectionDAG &DAG) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+
+  // FIXME: check for zeroes
+  LoadSDNode *LDBase = NULL;
+  unsigned LastLoadedElt = -1U;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Elts[i];
+
+    if (!Elt.getNode() ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+      return SDValue();
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+        return SDValue();
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+      return SDValue();
+    LastLoadedElt = i;
+  }
+
+  if (LastLoadedElt == NumElems - 1) {
+    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                       LDBase->isVolatile(), LDBase->isNonTemporal(),
+                       LDBase->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  }
+  return SDValue();
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
@@ -3841,14 +3889,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   }
 
-  if (Values.size() > 2) {
-    // If we have SSE 4.1, Expand into a number of inserts unless the number of
-    // values to be inserted is equal to the number of elements, in which case
-    // use the unpack code below in the hopes of matching the consecutive elts
-    // load merge pattern for shuffles.
-    // FIXME: We could probably just check that here directly.
-    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
-        getSubtarget()->hasSSE41()) {
+  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+    // Check for a build vector of consecutive loads.
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = Op.getOperand(i);
+
+    // Check for elements which are consecutive loads.
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    if (LD.getNode())
+      return LD;
+
+    // For SSE 4.1, use inserts into undef.
+    if (getSubtarget()->hasSSE41()) {
       V[0] = DAG.getUNDEF(VT);
       for (unsigned i = 0; i < NumElems; ++i)
         if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
@@ -3856,7 +3908,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
                              Op.getOperand(i), DAG.getIntPtrConstant(i));
       return V[0];
     }
-    // Expand into a number of unpckl*.
+
+    // Otherwise, expand into a number of unpckl*
     // e.g. for v4f32
     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
@@ -3871,7 +3924,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
     }
     return V[0];
   }
-
   return SDValue();
 }
 
@@ -8797,83 +8849,24 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     EVT EltVT, LoadSDNode *&LDBase,
-                                     unsigned &LastLoadedElt,
-                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
-                                     const TargetLowering &TLI) {
-  LDBase = NULL;
-  LastLoadedElt = -1U;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    if (N->getMaskElt(i) < 0) {
-      if (!LDBase)
-        return false;
-      continue;
-    }
-
-    SDValue Elt = DAG.getShuffleScalarElt(N, i);
-    if (!Elt.getNode() ||
-        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
-      return false;
-    if (!LDBase) {
-      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
-        return false;
-      LDBase = cast<LoadSDNode>(Elt.getNode());
-      LastLoadedElt = i;
-      continue;
-    }
-    if (Elt.getOpcode() == ISD::UNDEF)
-      continue;
-
-    LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
-      return false;
-    LastLoadedElt = i;
-  }
-  return true;
-}
-
 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
 /// if the load addresses are consecutive, non-overlapping, and in the right
-/// order. In the case of v2i64, it will see if it can rewrite the
-/// shuffle to be an appropriate build vector so it can take advantage of
-// performBuildVectorCombine.
+/// order.
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      const TargetLowering &TLI) {
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-  unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
     return SDValue();
 
-  // Try to combine a vector_shuffle into a 128-bit load.
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  LoadSDNode *LD = NULL;
-  unsigned LastLoadedElt;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
-                                MFI, TLI))
-    return SDValue();
-
-  if (LastLoadedElt == NumElems - 1) {
-    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
-      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                         LD->isVolatile(), LD->isNonTemporal(), 0);
-    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                       LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile(), LD->isNonTemporal(),
-                       LD->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
-  }
-  return SDValue();
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
+
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }
 
 /// PerformShuffleCombine - Detect vector gather/scatter index generation
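To illustrate the consecutive-load case handled by the new EltsFromConsecutiveLoads helper above, here is a hypothetical function (not part of this commit) that rebuilds a vector from four scalar loads of adjacent addresses. Assuming the elements reach the DAG as a BUILD_VECTOR of loads, or as the shuffle that PerformShuffleCombine now feeds into the same helper, the lowering can fold them into a single 128-bit load:

; Sketch only: the function name and pointer argument are assumptions for illustration.
define <4 x float> @load_four_adjacent(float* %p) nounwind {
entry:
  %p1 = getelementptr float* %p, i32 1
  %p2 = getelementptr float* %p, i32 2
  %p3 = getelementptr float* %p, i32 3
  %a = load float* %p
  %b = load float* %p1
  %c = load float* %p2
  %d = load float* %p3
  %v0 = insertelement <4 x float> undef, float %a, i32 0
  %v1 = insertelement <4 x float> %v0, float %b, i32 1
  %v2 = insertelement <4 x float> %v1, float %c, i32 2
  %v3 = insertelement <4 x float> %v2, float %d, i32 3
  ret <4 x float> %v3
}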
|
9
test/CodeGen/X86/vec_insert-9.ll
Normal file
9
test/CodeGen/X86/vec_insert-9.ll
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
|
||||||
|
; RUN: grep pinsrd %t | count 2
|
||||||
|
|
||||||
|
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
|
||||||
|
entry:
|
||||||
|
%tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0 ; <<4 x i32>> [#uses=1]
|
||||||
|
%tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3 ; <<4 x i32>> [#uses=1]
|
||||||
|
ret <4 x i32> %tmp4
|
||||||
|
}
|
(test file name not preserved in this view)
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep punpckl | count 7
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep punpckl | count 7
 
 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
   %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0   ; <<8 x i16>> [#uses=1]
(test file name not preserved in this view)
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep shufp %t | count 1
+; RUN: grep movq %t | count 1
+; RUN: grep pshufd %t | count 1
 ; RUN: grep movupd %t | count 1
 ; RUN: grep pshufhw %t | count 1
 