llvm-6502/lib/Target/ARM/ARMSelectionDAGInfo.cpp
Peter Collingbourne 27565d6185 Thumb2: Modify codegen for memcpy intrinsic to prefer LDM/STM.
We were previously codegen'ing these as regular load/store operations and
hoping that the register allocator would allocate registers in ascending order
so that we could apply an LDM/STM combine after register allocation. According
to the commit that first introduced this code (r37179), we planned to teach
the register allocator to allocate the registers in ascending order. This
never got implemented, and up to now we've been stuck with very poor codegen.

A much simpler approach for achiveing better codegen is to create LDM/STM
instructions with identical sets of virtual registers, let the register
allocator pick arbitrary registers and order register lists when printing an
MCInst. This approach also avoids the need to repeatedly calculate offsets
which ultimately ought to be eliminated pre-RA in order to decrease register
pressure.

This is implemented by lowering the memcpy intrinsic to a series of SD-only
MCOPY pseudo-instructions which performs a memory copy using a given number
of registers. During SD->MI lowering, we lower MCOPY to LDM/STM. This is a
little unusual, but it avoids the need to encode register lists in the SD,
and we can take advantage of SD use lists to decide whether to use the _UPD
variant of the instructions.

Fixes PR9199.

Differential Revision: http://reviews.llvm.org/D9508

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238473 91177308-0d34-0410-b5e6-96231b3b80d8
2015-05-28 20:02:45 +00:00

275 lines
9.5 KiB
C++

//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
: TargetSelectionDAGInfo(&DL) {}
ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
}
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
SDValue ARMSelectionDAGInfo::
EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
RTLIB::Libcall LC) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
// Only use a specialized AEABI function if the default version of this
// Libcall is an AEABI function.
if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
return SDValue();
// Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
// able to translate memset to memclr and use the value to index the function
// name array.
enum {
AEABI_MEMCPY = 0,
AEABI_MEMMOVE,
AEABI_MEMSET,
AEABI_MEMCLR
} AEABILibcall;
switch (LC) {
case RTLIB::MEMCPY:
AEABILibcall = AEABI_MEMCPY;
break;
case RTLIB::MEMMOVE:
AEABILibcall = AEABI_MEMMOVE;
break;
case RTLIB::MEMSET:
AEABILibcall = AEABI_MEMSET;
if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
if (ConstantSrc->getZExtValue() == 0)
AEABILibcall = AEABI_MEMCLR;
break;
default:
return SDValue();
}
// Choose the most-aligned libcall variant that we can
enum {
ALIGN1 = 0,
ALIGN4,
ALIGN8
} AlignVariant;
if ((Align & 7) == 0)
AlignVariant = ALIGN8;
else if ((Align & 3) == 0)
AlignVariant = ALIGN4;
else
AlignVariant = ALIGN1;
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = TLI->getDataLayout()->getIntPtrType(*DAG.getContext());
Entry.Node = Dst;
Args.push_back(Entry);
if (AEABILibcall == AEABI_MEMCLR) {
Entry.Node = Size;
Args.push_back(Entry);
} else if (AEABILibcall == AEABI_MEMSET) {
// Adjust parameters for memset, EABI uses format (ptr, size, value),
// GNU library uses (ptr, value, size)
// See RTABI section 4.3.4
Entry.Node = Size;
Args.push_back(Entry);
// Extend or truncate the argument to be an i32 value for the call.
if (Src.getValueType().bitsGT(MVT::i32))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
else if (Src.getValueType().bitsLT(MVT::i32))
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
Entry.Node = Src;
Entry.Ty = Type::getInt32Ty(*DAG.getContext());
Entry.isSExt = false;
Args.push_back(Entry);
} else {
Entry.Node = Src;
Args.push_back(Entry);
Entry.Node = Size;
Args.push_back(Entry);
}
char const *FunctionNames[4][3] = {
{ "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
{ "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
{ "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
{ "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
};
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(TLI->getLibcallCallingConv(LC),
Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
TLI->getPointerTy()), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
if ((Align & 3) != 0)
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMCPY);
uint64_t SizeVal = ConstantSize->getZExtValue();
if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMCPY);
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
EVT VT = MVT::i32;
unsigned VTSize = 4;
unsigned i = 0;
// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
SDValue TFOps[6];
SDValue Loads[6];
uint64_t SrcOff = 0, DstOff = 0;
// FIXME: We should invent a VMCOPY pseudo-instruction that lowers to
// VLDM/VSTM and make this code emit it when appropriate. This would reduce
// pressure on the general purpose registers. However this seems harder to map
// onto the register allocator's view of the world.
// The number of MCOPY pseudo-instructions to emit. We use up to MaxLoadsInLDM
// registers per mcopy, which will get lowered into ldm/stm later on. This is
// a lower bound on the number of MCOPY operations we must emit.
unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
for (unsigned I = 0; I != NumMCOPYs; ++I) {
// Evenly distribute registers among MCOPY operations to reduce register
// pressure.
unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMCOPYs;
unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src,
DAG.getConstant(NumRegs, dl, MVT::i32));
Src = Dst.getValue(1);
Chain = Dst.getValue(2);
DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
EmittedNumMemOps = NextEmittedNumMemOps;
}
if (BytesLeft == 0)
return Chain;
// Issue loads / stores for the trailing (1 - 3) bytes.
unsigned BytesLeftSave = BytesLeft;
i = 0;
while (BytesLeft) {
if (BytesLeft >= 2) {
VT = MVT::i16;
VTSize = 2;
} else {
VT = MVT::i8;
VTSize = 1;
}
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
SrcPtrInfo.getWithOffset(SrcOff),
false, false, false, 0);
TFOps[i] = Loads[i].getValue(1);
++i;
SrcOff += VTSize;
BytesLeft -= VTSize;
}
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(TFOps, i));
i = 0;
BytesLeft = BytesLeftSave;
while (BytesLeft) {
if (BytesLeft >= 2) {
VT = MVT::i16;
VTSize = 2;
} else {
VT = MVT::i8;
VTSize = 1;
}
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
DstPtrInfo.getWithOffset(DstOff), false, false, 0);
++i;
DstOff += VTSize;
BytesLeft -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(TFOps, i));
}
SDValue ARMSelectionDAGInfo::
EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMMOVE);
}
SDValue ARMSelectionDAGInfo::
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst,
SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMSET);
}