From 536e66764b93fd1368ce4d1f753cbe909c05b62d Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Thu, 12 Mar 2009 05:59:15 +0000
Subject: [PATCH] On x86, if the only use of an i64 load is an i64 store,
 generate a pair of double load and store instead.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@66776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 115 +++++++++++++++++------
 test/CodeGen/X86/i64-mem-copy.ll   |  13 ++++
 test/CodeGen/X86/mmx-copy-gprs.ll  |   6 +-
 3 files changed, 83 insertions(+), 51 deletions(-)
 create mode 100644 test/CodeGen/X86/i64-mem-copy.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2de73778793..f15acd73078 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8285,14 +8285,21 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
 
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
-                                  const X86Subtarget *Subtarget) {
+                                   const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS. This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->getValue().getValueType().isVector() &&
-      St->getValue().getValueType().getSizeInBits() == 64 &&
+  MVT VT = St->getValue().getValueType();
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
       isa<LoadSDNode>(St->getValue()) &&
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -8316,60 +8323,72 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
           Ops.push_back(ChainVal->getOperand(i));
       }
     }
-    if (Ld) {
-      DebugLoc DL = N->getDebugLoc();
-      // If we are a 64-bit capable x86, lower to a single movq load/store pair.
-      if (Subtarget->is64Bit()) {
-        SDValue NewLd = DAG.getLoad(MVT::i64, DL, Ld->getChain(),
-                                    Ld->getBasePtr(), Ld->getSrcValue(),
-                                    Ld->getSrcValueOffset(), Ld->isVolatile(),
-                                    Ld->getAlignment());
-        SDValue NewChain = NewLd.getValue(1);
-        if (TokenFactorIndex != -1) {
-          Ops.push_back(NewChain);
-          NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Ops[0],
-                                 Ops.size());
-        }
-        return DAG.getStore(NewChain, DL, NewLd, St->getBasePtr(),
-                            St->getSrcValue(), St->getSrcValueOffset(),
-                            St->isVolatile(), St->getAlignment());
-      }
-      // Otherwise, lower to two 32-bit copies.
-      SDValue LoAddr = Ld->getBasePtr();
-      SDValue HiAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, LoAddr,
-                                   DAG.getConstant(4, MVT::i32));
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
 
-      SDValue LoLd = DAG.getLoad(MVT::i32, DL, Ld->getChain(), LoAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                                 Ld->isVolatile(), Ld->getAlignment());
-      SDValue HiLd = DAG.getLoad(MVT::i32, DL, Ld->getChain(), HiAddr,
-                                 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
-                                 Ld->isVolatile(),
-                                 MinAlign(Ld->getAlignment(), 4));
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
 
-      SDValue NewChain = LoLd.getValue(1);
+    DebugLoc LdDL = Ld->getDebugLoc();
+    DebugLoc StDL = N->getDebugLoc();
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget->is64Bit() || F64IsLegal) {
+      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+                                  Ld->getBasePtr(), Ld->getSrcValue(),
+                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
+                                  Ld->getAlignment());
+      SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
-        Ops.push_back(LoLd);
-        Ops.push_back(HiLd);
-        NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Ops[0],
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                                Ops.size());
       }
-
-      LoAddr = St->getBasePtr();
-      HiAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, LoAddr,
-                           DAG.getConstant(4, MVT::i32));
-
-      SDValue LoSt = DAG.getStore(NewChain, DL, LoLd, LoAddr,
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getSrcValue(), St->getSrcValueOffset(),
                           St->isVolatile(), St->getAlignment());
-      SDValue HiSt = DAG.getStore(NewChain, DL, HiLd, HiAddr,
-                                  St->getSrcValue(),
-                                  St->getSrcValueOffset() + 4,
-                                  St->isVolatile(),
-                                  MinAlign(St->getAlignment(), 4));
-      return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoSt, HiSt);
     }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+                                 DAG.getConstant(4, MVT::i32));
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                               Ld->isVolatile(), Ld->getAlignment());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+                               Ld->isVolatile(),
+                               MinAlign(Ld->getAlignment(), 4));
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex != -1) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+                             Ops.size());
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+                         DAG.getConstant(4, MVT::i32));
+
+    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+                                St->getSrcValue(), St->getSrcValueOffset(),
+                                St->isVolatile(), St->getAlignment());
+    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+                                St->getSrcValue(),
+                                St->getSrcValueOffset() + 4,
+                                St->isVolatile(),
+                                MinAlign(St->getAlignment(), 4));
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   }
   return SDValue();
 }
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
new file mode 100644
index 00000000000..ce540112087
--- /dev/null
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {movsd.*(%eax),}
+
+; Uses movsd to load / store i64 values if sse2 is available.
+
+; rdar://6659858
+
+define void @foo(i64* %x, i64* %y) nounwind {
+entry:
+  %tmp1 = load i64* %y, align 8   ; <i64> [#uses=1]
+  store i64 %tmp1, i64* %x, align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/mmx-copy-gprs.ll b/test/CodeGen/X86/mmx-copy-gprs.ll
index da17a04a466..2047ce75e57 100644
--- a/test/CodeGen/X86/mmx-copy-gprs.ll
+++ b/test/CodeGen/X86/mmx-copy-gprs.ll
@@ -1,11 +1,11 @@
-; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
-; RUN: llvm-as < %s | llc -march=x86 | grep {movl.*4(%eax),}
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=-sse2 | grep {movl.*4(%eax),}
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {movsd.(%eax),}
 
 ; This test should use GPRs to copy the mmx value, not MMX regs.  Using mmx regs
 ; increases the places that need to use emms.
 
 ; rdar://5741668
-target triple = "x86_64-apple-darwin8"
 
 define void @foo(<1 x i64>* %x, <1 x i64>* %y) nounwind {
 entry:
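
For illustration, not part of the patch: the new i64-mem-copy.ll test is just a 64-bit copy through memory. A C equivalent is sketched below; the function name mirrors the test, the expected codegen in the comments is inferred from the RUN lines above, and the exact registers shown are illustrative.

    /* Copy an i64 through memory, as in i64-mem-copy.ll.  On x86-64 this
       stays a single movq load/store pair.  On 32-bit x86, this patch turns
       the two movl load/store pairs into one movsd pair when SSE2 is
       available, roughly:
           movsd  (%eax), %xmm0
           movsd  %xmm0, (%ecx)
       Without SSE2, or when soft-float / no-implicit-float is in effect
       (the F64IsLegal check in PerformSTORECombine), the two 32-bit
       load/store pairs are kept. */
    void foo(long long *x, long long *y) {
      *x = *y;
    }

The transformation also requires that the store be the load's only use (the hasNUsesOfValue check): if the loaded i64 value were also consumed by integer arithmetic, routing it through an XMM register would cost extra cross-domain moves instead of saving them.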