Optimize splat of a scalar load into a shuffle of a vector load when it's legal, e.g.:

  vector_shuffle (scalar_to_vector (i32 load (ptr + 4))), undef, <0, 0, 0, 0>
    => vector_shuffle (v4i32 load ptr), undef, <1, 1, 1, 1>

iff ptr is 16-byte aligned (or can be made 16-byte aligned).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@90984 91177308-0d34-0410-b5e6-96231b3b80d8

parent 89452f7386
commit c363094e04
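For concreteness, the offset-to-lane arithmetic the patch performs can be sketched standalone (plain C++, not LLVM code; the variable names mirror those in the patch below): a scalar i32 load at ptr + 4, with ptr 16-byte aligned, becomes lane 1 of a v4i32 load at ptr, hence the <1, 1, 1, 1> mask.

// Standalone sketch of the patch's offset-to-lane arithmetic (not LLVM
// code): widen a scalar load at base + Offset into a 16-byte vector load
// and splat the lane the scalar landed in.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t Offset = 4;                 // scalar i32 load at ptr + 4
  const int64_t StartOffset = Offset & ~15; // vector load at ptr + StartOffset
  const int EltNo = int((Offset - StartOffset) >> 2); // 4-byte lanes
  std::printf("v4i32 load at ptr+%lld, mask <%d, %d, %d, %d>\n",
              (long long)StartOffset, EltNo, EltNo, EltNo, EltNo);
  // Output: v4i32 load at ptr+0, mask <1, 1, 1, 1>
  return 0;
}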
lib/Target/X86/X86ISelLowering.cpp

@@ -3343,6 +3343,82 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                              DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
 }
 
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) {
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst", see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (Ptr.getOpcode() == ISD::ADD &&
+               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < 16) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. Reference stack + offset explicitly
+        // if stack pointer is at least 16-byte aligned.
+        unsigned StackAlign = Subtarget->getStackAlignment();
+        if (StackAlign < 16)
+          return SDValue();
+        Offset = MFI->getObjectOffset(FI) + Offset;
+        SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+                                              getPointerTy());
+        Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                          DAG.getConstant(Offset & ~15, getPointerTy()));
+        Offset %= 16;
+      } else {
+        MFI->setObjectAlignment(FI, 16);
+      }
+    }
+
+    // (Offset % 16) must be a multiple of 4. The address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % 16) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~15;
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0);
+    // Canonicalize it to a v4i32 shuffle.
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+  }
+
+  return SDValue();
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
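The new function bails out unless the within-16-byte part of the offset selects a whole 4-byte lane. A minimal restatement of that guard, outside of LLVM (the helper name is ours, not part of the patch):

#include <cstdint>

// Hedged restatement of the guards above: a widened splat load is only
// formed for non-negative offsets whose residue modulo 16 is 4-byte
// aligned, so that (Offset % 16) / 4 names a whole v4i32/v4f32 lane.
bool offsetSelectsWholeLane(int64_t Offset) {
  if (Offset < 0)
    return false;                  // rejected outright, as in the patch
  return ((Offset % 16) & 3) == 0; // lane index must be exact
}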
@@ -3486,8 +3562,19 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   }
 
   // Splat is obviously ok. Let legalizer expand it to a shuffle.
-  if (Values.size() == 1)
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // check if it's possible to issue this instead:
+      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
     return SDValue();
+  }
 
   // A vector full of immediates; various special cases are already
   // handled, so this is best done with a single constant-pool load.
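Here CountTrailingZeros_32(NonZeros) simply recovers the index of the first build_vector operand that was recorded as non-zero. A portable stand-in (our own sketch, not the MathExtras.h implementation):

#include <cstdint>

// Index of the lowest set bit of V, i.e. the first lane flagged in the
// NonZeros mask. Assumes V != 0, which the splat path above guarantees
// (the all-zero vector is handled earlier in LowerBUILD_VECTOR).
unsigned countTrailingZeros32(uint32_t V) {
  unsigned Idx = 0;
  while (V && !(V & 1u)) {
    V >>= 1;
    ++Idx;
  }
  return Idx;
}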
@@ -4278,7 +4365,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   unsigned ShAmt = 0;
   SDValue ShVal;
   bool isShift = getSubtarget()->hasSSE2() &&
-  isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+                 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   if (isShift && ShVal.hasOneUse()) {
     // If the shifted value has multiple uses, it may be cheaper to use
     // v_set0 + movlhps or movhlps, etc.
lib/Target/X86/X86ISelLowering.h

@@ -626,7 +626,9 @@ namespace llvm {
 
     std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                                bool isSigned);
 
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG);
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
lib/Target/X86/X86InstrSSE.td

@@ -2083,7 +2083,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                      (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                      "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst, (v4i32 (pshufd:$src2
-                                               (bc_v4i32(memopv2i64 addr:$src1)),
+                                               (bc_v4i32 (memopv2i64 addr:$src1)),
                                                (undef))))]>;
 }
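The memory form of pshufd is what the canonicalized v4i32 shuffle is meant to match. Its 8-bit immediate packs four 2-bit source-lane selectors, with result element i taking source lane (imm >> 2*i) & 3, so a splat of lane e encodes as e repeated in all four fields. A small sketch of that encoding rule (our helper, not LLVM's pattern code):

#include <cstdint>

// Encode a 4-lane shuffle mask as a pshufd immediate: 2 bits per result
// element, element i in bits [2*i+1 : 2*i].
uint8_t pshufdImmediate(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int I = 0; I < 4; ++I)
    Imm |= uint8_t(Mask[I] & 3) << (2 * I);
  return Imm;
}
// A splat mask <1, 1, 1, 1> yields 0b01010101 = 0x55 = 85.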
test/CodeGen/X86/splat-scalar-load.ll (new file, 43 lines)

@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; rdar://7434544
+
+define <2 x i64> @t1() nounwind ssp {
+entry:
+; CHECK: t1:
+; CHECK: pshufd $0, (%esp), %xmm0
+  %array = alloca [8 x float], align 16
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 0
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @t2() nounwind ssp {
+entry:
+; CHECK: t2:
+; CHECK: pshufd $85, (%esp), %xmm0
+  %array = alloca [8 x float], align 4
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <4 x float> @t3(float %tmp1, float %tmp2, float %tmp3) nounwind readnone ssp {
+entry:
+; CHECK: t3:
+; CHECK: pshufd $-86, (%esp), %xmm0
+  %0 = insertelement <4 x float> undef, float %tmp3, i32 0
+  %1 = insertelement <4 x float> %0, float %tmp3, i32 1
+  %2 = insertelement <4 x float> %1, float %tmp3, i32 2
+  %3 = insertelement <4 x float> %2, float %tmp3, i32 3
+  ret <4 x float> %3
+}
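The three immediates the CHECK lines expect follow directly from that 2-bit-per-lane encoding; llc prints the immediate as a signed byte, which is why lane 2's 0xAA appears as -86. A quick standalone decode (t1 splats lane 0, t2 lane 1, t3 lane 2 of the widened 16-byte load):

#include <cstdint>
#include <cstdio>

int main() {
  // Immediates from the CHECK lines above: t1, t2, t3.
  const int8_t Imms[] = {0, 85, -86};
  for (int8_t SignedImm : Imms) {
    const uint8_t Imm = uint8_t(SignedImm);
    std::printf("pshufd $%d -> <%u, %u, %u, %u>\n", SignedImm, Imm & 3u,
                (Imm >> 2) & 3u, (Imm >> 4) & 3u, (Imm >> 6) & 3u);
  }
  // pshufd $0 -> <0, 0, 0, 0>; $85 -> <1, 1, 1, 1>; $-86 -> <2, 2, 2, 2>
  return 0;
}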