Try to reuse the value when lowering memset.

This allows us to compile: void test(char *s, int a) { __builtin_memset(s, a, 15); } into 1 mul + 3 stores instead of 3 muls + 3 stores. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@122710 91177308-0d34-0410-b5e6-96231b3b80d8
2025-03-31 11:32:42 +00:00 · 2011-01-02 19:57:05 +00:00 · 2011-01-02 19:57:05 +00:00 · 80220369b0
commit 80220369b0
parent 8c06aa1c59
3 changed files with 30 additions and 47 deletions
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -3527,16 +3527,34 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
  SmallVector<SDValue, 8> OutChains;
  uint64_t DstOff = 0;
  unsigned NumMemOps = MemOps.size();
+
+  // Find the largest store and generate the bit pattern for it.
+  EVT LargestVT = MemOps[0];
+  for (unsigned i = 1; i < NumMemOps; i++)
+    if (MemOps[i].bitsGT(LargestVT))
+      LargestVT = MemOps[i];
+  SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
+
  for (unsigned i = 0; i < NumMemOps; i++) {
    EVT VT = MemOps[i];
-    unsigned VTSize = VT.getSizeInBits() / 8;
-    SDValue Value = getMemsetValue(Src, VT, DAG, dl);
+
+    // If this store is smaller than the largest store see whether we can get
+    // the smaller value for free with a truncate.
+    SDValue Value = MemSetValue;
+    if (VT.bitsLT(LargestVT)) {
+      if (!LargestVT.isVector() && !VT.isVector() &&
+          TLI.isTruncateFree(LargestVT, VT))
+        Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
+      else
+        Value = getMemsetValue(Src, VT, DAG, dl);
+    }
+    assert(Value.getValueType() == VT && "Value with wrong type.");
    SDValue Store = DAG.getStore(Chain, dl, Value,
                                 getMemBasePlusOffset(Dst, DstOff, DAG),
                                 DstPtrInfo.getWithOffset(DstOff),
                                 isVol, false, Align);
    OutChains.push_back(Store);
-    DstOff += VTSize;
+    DstOff += VT.getSizeInBits() / 8;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@ -41,50 +41,6 @@ saved a few instructions.

 //===---------------------------------------------------------------------===//

-Poor codegen:
-
-int X[2];
-int b;
-void test(void) {
-  memset(X, b, 2*sizeof(X[0]));
-}
-
-llc:
-	movq _b@GOTPCREL(%rip), %rax
-	movzbq (%rax), %rax
-	movq %rax, %rcx
-	shlq $8, %rcx
-	orq %rax, %rcx
-	movq %rcx, %rax
-	shlq $16, %rax
-	orq %rcx, %rax
-	movq %rax, %rcx
-	shlq $32, %rcx
-	movq _X@GOTPCREL(%rip), %rdx
-	orq %rax, %rcx
-	movq %rcx, (%rdx)
-	ret
-
-gcc:
-	movq	_b@GOTPCREL(%rip), %rax
-	movabsq	$72340172838076673, %rdx
-	movzbq	(%rax), %rax
-	imulq	%rdx, %rax
-	movq	_X@GOTPCREL(%rip), %rdx
-	movq	%rax, (%rdx)
-	ret
-
-And the codegen is even worse for the following
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103):
-  void fill1(char *s, int a)
-  {
-    __builtin_memset(s, a, 15);
-  }
-
-For this version, we duplicate the computation of the constant to store.
-
-//===---------------------------------------------------------------------===//
-
 It's not possible to reference AH, BH, CH, and DH registers in an instruction
 requiring REX prefix. However, divb and mulb both produce results in AH. If isel
 emits a CopyFromReg which gets turned into a movb and that can be allocated a
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@ -28,3 +28,12 @@ entry:
 ; CHECK: imull $16843009
 }

+define void @t4(i8* nocapture %s, i8 %a) nounwind {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i32 1, i1 false)
+  ret void
+; CHECK: t4:
+; CHECK: imull $16843009
+; CHECK-NOT: imul
+; CHECK: ret
+}