Some enhancements for memcpy / memset inline expansion.

1. Teach it to use overlapping unaligned load / store to copy / set the trailing bytes. e.g. On 86, use two pairs of movups / movaps for 17 - 31 byte copies. 2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is. e.g. x86 and ARM. 3. When memcpy from a constant string, do *not* replace the load with a constant if it's not possible to materialize an integer immediate with a single instruction (required a new target hook: TLI.isIntImmLegal()). 4. Use unaligned load / stores more aggressively if target hooks indicates they are "fast". 5. Update ARM target hooks to use unaligned load / stores. e.g. vld1.8 / vst1.8. Also increase the threshold to something reasonable (8 for memset, 4 pairs for memcpy). This significantly improves Dhrystone, up to 50% on ARM iOS devices. rdar://12760078 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
2026-04-25 21:18:19 +00:00 · 2012-12-10 23:21:26 +00:00
parent 2b475922e6
commit 376642ed62
15 changed files with 299 additions and 85 deletions
@@ -3373,7 +3373,7 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
  unsigned NumVTBytes = VT.getSizeInBits() / 8;
  unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));

-  uint64_t Val = 0;
+  APInt Val(NumBytes*8, 0);
  if (TLI.isLittleEndian()) {
    for (unsigned i = 0; i != NumBytes; ++i)
      Val |= (uint64_t)(unsigned char)Str[i] << i*8;
@@ -3382,7 +3382,9 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
      Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
  }

-  return DAG.getConstant(Val, VT);
+  if (TLI.isIntImmLegal(Val, VT))
+    return DAG.getConstant(Val, VT);
+  return SDValue(0, 0);
 }

 /// getMemBasePlusOffset - Returns base and offset node for the
@@ -3422,6 +3424,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsZeroVal,
                                     bool MemcpyStrSrc,
+                                     bool AllowOverlap,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
@@ -3461,24 +3464,47 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,

  unsigned NumMemOps = 0;
  while (Size != 0) {
+    if (++NumMemOps > Limit)
+      return false;
+
    unsigned VTSize = VT.getSizeInBits() / 8;
    while (VTSize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
+      EVT NewVT;
+      unsigned NewVTSize;
      if (VT.isVector() || VT.isFloatingPoint()) {
-        VT = MVT::i64;
-        while (!TLI.isTypeLegal(VT))
-          VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
-        VTSize = VT.getSizeInBits() / 8;
+        NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+        while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) {
+          if (NewVT == MVT::i64 &&
+              TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) {
+            // i64 is usually not legal on 32-bit targets, but f64 may be.
+            NewVT = MVT::f64;
+            break;
+          }
+          NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+        }
+        NewVTSize = NewVT.getSizeInBits() / 8;
      } else {
        // This can result in a type that is not legal on the target, e.g.
        // 1 or 2 bytes on PPC.
-        VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
-        VTSize >>= 1;
+        NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+        NewVTSize = VTSize >> 1;
+      }
+
+      // If the new VT cannot cover all of the remaining bits, then consider
+      // issuing a (or a pair of) unaligned and overlapping load / store.
+      // FIXME: Only does this for 64-bit or more since we don't have proper
+      // cost model for unaligned load / store.
+      bool Fast;
+      if (AllowOverlap && VTSize >= 8 && NewVTSize < Size &&
+          TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
+        VTSize = Size;
+      else {
+        VT = NewVT;
+        VTSize = NewVTSize;
      }
    }

-    if (++NumMemOps > Limit)
-      return false;
    MemOps.push_back(VT);
    Size -= VTSize;
  }
@@ -3523,7 +3549,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
                                (DstAlignCanChange ? 0 : Align),
                                (isZeroStr ? 0 : SrcAlign),
-                                true, CopyFromStr, DAG, TLI))
+                                true, CopyFromStr, true, DAG, TLI))
    return SDValue();

  if (DstAlignCanChange) {
@@ -3545,6 +3571,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
    unsigned VTSize = VT.getSizeInBits() / 8;
    SDValue Value, Store;

+    if (VTSize > Size) {
+      // Issuing an unaligned load / store pair  that overlaps with the previous
+      // pair. Adjust the offset accordingly.
+      assert(i == NumMemOps-1 && i != 0);
+      SrcOff -= VTSize - Size;
+      DstOff -= VTSize - Size;
+    }
+
    if (CopyFromStr &&
        (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
      // It's unlikely a store of a vector immediate can be done in a single
@@ -3553,11 +3587,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
      // FIXME: Handle other cases where store of vector immediate is done in
      // a single instruction.
      Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
-      Store = DAG.getStore(Chain, dl, Value,
-                           getMemBasePlusOffset(Dst, DstOff, DAG),
-                           DstPtrInfo.getWithOffset(DstOff), isVol,
-                           false, Align);
-    } else {
+      if (Value.getNode())
+        Store = DAG.getStore(Chain, dl, Value,
+                             getMemBasePlusOffset(Dst, DstOff, DAG),
+                             DstPtrInfo.getWithOffset(DstOff), isVol,
+                             false, Align);
+    }
+
+    if (!Store.getNode()) {
      // The type might not be legal for the target.  This should only happen
      // if the type is smaller than a legal type, as on PPC, so the right
      // thing to do is generate a LoadExt/StoreTrunc pair.  These simplify
@@ -3577,6 +3614,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
    OutChains.push_back(Store);
    SrcOff += VTSize;
    DstOff += VTSize;
+    Size -= VTSize;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -3613,7 +3651,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,

  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
                                (DstAlignCanChange ? 0 : Align),
-                                SrcAlign, true, false, DAG, TLI))
+                                SrcAlign, true, false, false, DAG, TLI))
    return SDValue();

  if (DstAlignCanChange) {
@@ -3689,7 +3727,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
    isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
  if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
                                Size, (DstAlignCanChange ? 0 : Align), 0,
-                                IsZeroVal, false, DAG, TLI))
+                                IsZeroVal, false, true, DAG, TLI))
    return SDValue();

  if (DstAlignCanChange) {
@@ -3716,6 +3754,13 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,

  for (unsigned i = 0; i < NumMemOps; i++) {
    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    if (VTSize > Size) {
+      // Issuing an unaligned load / store pair  that overlaps with the previous
+      // pair. Adjust the offset accordingly.
+      assert(i == NumMemOps-1 && i != 0);
+      DstOff -= VTSize - Size;
+    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
@@ -3734,6 +3779,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
                                 isVol, false, Align);
    OutChains.push_back(Store);
    DstOff += VT.getSizeInBits() / 8;
+    Size -= VTSize;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,