From c363094e04df621d41ca570eb2a7bf8826bb8c1a Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Wed, 9 Dec 2009 21:00:30 +0000
Subject: [PATCH] Optimize splat of a scalar load into a shuffle of a vector
 load when it's legal. e.g.

vector_shuffle (scalar_to_vector (i32 load (ptr + 4))), undef, <0, 0, 0, 0>
=>
vector_shuffle (v4i32 load ptr), undef, <1, 1, 1, 1>
iff ptr is 16-byte aligned (or can be made 16-byte aligned).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@90984 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp    | 91 ++++++++++++++++++++++++++-
 lib/Target/X86/X86ISelLowering.h      |  4 +-
 lib/Target/X86/X86InstrSSE.td         |  2 +-
 test/CodeGen/X86/splat-scalar-load.ll | 43 +++++++++++++
 4 files changed, 136 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/X86/splat-scalar-load.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38e1ffe8924..8284b17272d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3343,6 +3343,82 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                                  DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
 }
 
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) {
+
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst", see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (Ptr.getOpcode() == ISD::ADD &&
+               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < 16) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. Reference stack + offset explicitly
+        // if stack pointer is at least 16-byte aligned.
+        unsigned StackAlign = Subtarget->getStackAlignment();
+        if (StackAlign < 16)
+          return SDValue();
+        Offset = MFI->getObjectOffset(FI) + Offset;
+        SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+                                              getPointerTy());
+        Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                          DAG.getConstant(Offset & ~15, getPointerTy()));
+        Offset %= 16;
+      } else {
+        MFI->setObjectAlignment(FI, 16);
+      }
+    }
+
+    // (Offset % 16) must be a multiple of 4. The address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % 16) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~15;
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0);
+    // Canonicalize it to a v4i32 shuffle.
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+  }
+
+  return SDValue();
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
@@ -3486,8 +3562,19 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   }
 
   // Splat is obviously ok. Let legalizer expand it to a shuffle.
-  if (Values.size() == 1)
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // check if it's possible to issue this instead:
+      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
     return SDValue();
+  }
 
   // A vector full of immediates; various special cases are already
   // handled, so this is best done with a single constant-pool load.
@@ -4278,7 +4365,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   unsigned ShAmt = 0;
   SDValue ShVal;
   bool isShift = getSubtarget()->hasSSE2() &&
-    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+                 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   if (isShift && ShVal.hasOneUse()) {
     // If the shifted value has multiple uses, it may be cheaper to use
     // v_set0 + movlhps or movhlps, etc.
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 7b4ab62fddd..89b773df562 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -626,7 +626,9 @@ namespace llvm {
 
     std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                                bool isSigned);
-
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG);
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index dfdd4ce36c6..62841f8dec2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2083,7 +2083,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                      (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                      "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst, (v4i32 (pshufd:$src2
-                                               (bc_v4i32(memopv2i64 addr:$src1)),
+                                               (bc_v4i32 (memopv2i64 addr:$src1)),
                                                (undef))))]>;
 }
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
new file mode 100644
index 00000000000..6c93efab704
--- /dev/null
+++ b/test/CodeGen/X86/splat-scalar-load.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; rdar://7434544
+
+define <2 x i64> @t1() nounwind ssp {
+entry:
+; CHECK: t1:
+; CHECK: pshufd $0, (%esp), %xmm0
+  %array = alloca [8 x float], align 16
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 0
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @t2() nounwind ssp {
+entry:
+; CHECK: t2:
+; CHECK: pshufd $85, (%esp), %xmm0
+  %array = alloca [8 x float], align 4
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <4 x float> @t3(float %tmp1, float %tmp2, float %tmp3) nounwind readnone ssp {
+entry:
+; CHECK: t3:
+; CHECK: pshufd $-86, (%esp), %xmm0
+  %0 = insertelement <4 x float> undef, float %tmp3, i32 0
+  %1 = insertelement <4 x float> %0, float %tmp3, i32 1
+  %2 = insertelement <4 x float> %1, float %tmp3, i32 2
+  %3 = insertelement <4 x float> %2, float %tmp3, i32 3
+  ret <4 x float> %3
+}
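
For illustration, below is a minimal standalone C++ sketch of the lane arithmetic
the new LowerAsSplatVectorLoad performs. It is not part of the patch: the helper
names splatLaneForOffset and pshufdImmForLane are invented for this note, and
only the Offset/StartOffset/EltNo math mirrors the code above.

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Mirrors the patch's EltNo computation: a scalar load at
  // (16-byte-aligned base) + Offset is widened into a vector load starting
  // at base + (Offset & ~15), and the scalar lands in 32-bit lane
  // (Offset - StartOffset) / 4, which the shuffle mask then splats.
  static int splatLaneForOffset(int64_t Offset) {
    // Same legality checks on which the patch bails out with SDValue().
    assert(Offset >= 0 && ((Offset % 16) & 3) == 0 &&
           "offset must be non-negative and a multiple of 4 within the block");
    int64_t StartOffset = Offset & ~15;
    return static_cast<int>((Offset - StartOffset) >> 2);
  }

  // The pshufd immediate that splats lane E repeats E in all four 2-bit
  // fields: E | E<<2 | E<<4 | E<<6 == E * 0x55 (for E in 0..3).
  static unsigned pshufdImmForLane(int Lane) {
    return static_cast<unsigned>(Lane) * 0x55;
  }

  int main() {
    // The commit-message example: a load at ptr + 4 becomes lane 1 of the
    // vector load at ptr, i.e. shuffle mask <1, 1, 1, 1>.
    int Lane = splatLaneForOffset(4);
    printf("offset  4 -> lane %d, pshufd imm 0x%02X\n", Lane,
           pshufdImmForLane(Lane)); // lane 1, imm 0x55 (the $85 in test t2)
    Lane = splatLaneForOffset(8);
    printf("offset  8 -> lane %d, pshufd imm 0x%02X\n", Lane,
           pshufdImmForLane(Lane)); // lane 2, imm 0xAA (the $-86 in test t3)
    Lane = splatLaneForOffset(20);
    printf("offset 20 -> lane %d, pshufd imm 0x%02X\n", Lane,
           pshufdImmForLane(Lane)); // lane 1 again: 20 & ~15 == 16
    return 0;
  }

This also explains the test's magic constants: t2 checks `pshufd $85` because
0x55 selects lane 1 four times, and t3 checks `pshufd $-86` because the AT&T
printer emits the lane-2 immediate 0xAA as a signed byte.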