Some enhancements for memcpy / memset inline expansion.

1. Teach it to use overlapping unaligned load / store to copy / set the trailing
   bytes. e.g. On 86, use two pairs of movups / movaps for 17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is. e.g.
   x86 and ARM.
3. When memcpy from a constant string, do *not* replace the load with a constant
   if it's not possible to materialize an integer immediate with a single
   instruction (required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if target hooks indicates they
   are "fast".
5. Update ARM target hooks to use unaligned load / stores. e.g. vld1.8 / vst1.8.
   Also increase the threshold to something reasonable (8 for memset, 4 pairs
   for memcpy).

This significantly improves Dhrystone, up to 50% on ARM iOS devices.

rdar://12760078


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng
2012-12-10 23:21:26 +00:00
parent 2b475922e6
commit 376642ed62
15 changed files with 299 additions and 85 deletions
+64 -18
View File
@@ -3373,7 +3373,7 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
unsigned NumVTBytes = VT.getSizeInBits() / 8;
unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
uint64_t Val = 0;
APInt Val(NumBytes*8, 0);
if (TLI.isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Str[i] << i*8;
@@ -3382,7 +3382,9 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
}
return DAG.getConstant(Val, VT);
if (TLI.isIntImmLegal(Val, VT))
return DAG.getConstant(Val, VT);
return SDValue(0, 0);
}
/// getMemBasePlusOffset - Returns base and offset node for the
@@ -3422,6 +3424,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned DstAlign, unsigned SrcAlign,
bool IsZeroVal,
bool MemcpyStrSrc,
bool AllowOverlap,
SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
@@ -3461,24 +3464,47 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned NumMemOps = 0;
while (Size != 0) {
if (++NumMemOps > Limit)
return false;
unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
EVT NewVT;
unsigned NewVTSize;
if (VT.isVector() || VT.isFloatingPoint()) {
VT = MVT::i64;
while (!TLI.isTypeLegal(VT))
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
VTSize = VT.getSizeInBits() / 8;
NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) {
if (NewVT == MVT::i64 &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) {
// i64 is usually not legal on 32-bit targets, but f64 may be.
NewVT = MVT::f64;
break;
}
NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
}
NewVTSize = NewVT.getSizeInBits() / 8;
} else {
// This can result in a type that is not legal on the target, e.g.
// 1 or 2 bytes on PPC.
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
VTSize >>= 1;
NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
NewVTSize = VTSize >> 1;
}
// If the new VT cannot cover all of the remaining bits, then consider
// issuing a (or a pair of) unaligned and overlapping load / store.
// FIXME: Only does this for 64-bit or more since we don't have proper
// cost model for unaligned load / store.
bool Fast;
if (AllowOverlap && VTSize >= 8 && NewVTSize < Size &&
TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
VTSize = Size;
else {
VT = NewVT;
VTSize = NewVTSize;
}
}
if (++NumMemOps > Limit)
return false;
MemOps.push_back(VT);
Size -= VTSize;
}
@@ -3523,7 +3549,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
(isZeroStr ? 0 : SrcAlign),
true, CopyFromStr, DAG, TLI))
true, CopyFromStr, true, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3545,6 +3571,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value, Store;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
SrcOff -= VTSize - Size;
DstOff -= VTSize - Size;
}
if (CopyFromStr &&
(isZeroStr || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
@@ -3553,11 +3587,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstPtrInfo.getWithOffset(DstOff), isVol,
false, Align);
} else {
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstPtrInfo.getWithOffset(DstOff), isVol,
false, Align);
}
if (!Store.getNode()) {
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
@@ -3577,6 +3614,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -3613,7 +3651,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
SrcAlign, true, false, DAG, TLI))
SrcAlign, true, false, false, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3689,7 +3727,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
Size, (DstAlignCanChange ? 0 : Align), 0,
IsZeroVal, false, DAG, TLI))
IsZeroVal, false, true, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3716,6 +3754,13 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
DstOff -= VTSize - Size;
}
// If this store is smaller than the largest store see whether we can get
// the smaller value for free with a truncate.
@@ -3734,6 +3779,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isVol, false, Align);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,