From 9bfa03c6fd8e02b738e0077fd1af7b18eeeeb4c1 Mon Sep 17 00:00:00 2001 From: Evan Cheng <evan.cheng@apple.com> Date: Mon, 12 May 2008 23:04:07 +0000 Subject: [PATCH] Xform bitconvert(build_pair(load a, load b)) to a single load if the load locations are at the right offset from each other. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@51008 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 2 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 49 +++++++++++++++++++ lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- lib/Target/X86/README-SSE.txt | 54 --------------------- lib/Target/X86/X86ISelLowering.cpp | 24 ++------- test/CodeGen/X86/combine-lds.ll | 7 +++ 6 files changed, 63 insertions(+), 75 deletions(-) create mode 100644 test/CodeGen/X86/combine-lds.ll diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 3b704298a87..ffc927a56c6 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -691,7 +691,7 @@ public: /// loading 'Bytes' bytes from a location that is 'Dist' units away from the /// location that the 'Base' load is loading from. 
bool isConsecutiveLoad(SDNode *LD, SDNode *Base, unsigned Bytes, int Dist, - MachineFrameInfo *MFI) const; + const MachineFrameInfo *MFI) const; /// PerformDAGCombine - This method will be invoked for all target nodes and /// for any target-independent nodes that the target has registered with diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 28f32d3d3b9..684b2f66a59 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -177,6 +177,7 @@ namespace { SDOperand visitSIGN_EXTEND_INREG(SDNode *N); SDOperand visitTRUNCATE(SDNode *N); SDOperand visitBIT_CONVERT(SDNode *N); + SDOperand visitBUILD_PAIR(SDNode *N); SDOperand visitFADD(SDNode *N); SDOperand visitFSUB(SDNode *N); SDOperand visitFMUL(SDNode *N); @@ -217,6 +218,7 @@ namespace { ISD::CondCode Cond, bool foldBooleans = true); SDOperand SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); + SDOperand CombineConsecutiveLoads(SDNode *N, MVT::ValueType VT); SDOperand ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *, MVT::ValueType); SDOperand BuildSDIV(SDNode *N); SDOperand BuildUDIV(SDNode *N); @@ -710,6 +712,7 @@ SDOperand DAGCombiner::visit(SDNode *N) { case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::TRUNCATE: return visitTRUNCATE(N); case ISD::BIT_CONVERT: return visitBIT_CONVERT(N); + case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); @@ -3356,6 +3359,40 @@ SDOperand DAGCombiner::visitTRUNCATE(SDNode *N) { return ReduceLoadWidth(N); } +static SDNode *getBuildPairElt(SDNode *N, unsigned i) { + SDOperand Elt = N->getOperand(i); + if (Elt.getOpcode() != ISD::MERGE_VALUES) + return Elt.Val; + return Elt.getOperand(Elt.ResNo).Val; +} + +/// CombineConsecutiveLoads - build_pair (load, load) -> load +/// if load locations are consecutive. 
+SDOperand DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT::ValueType VT) { + assert(N->getOpcode() == ISD::BUILD_PAIR); + + SDNode *LD1 = getBuildPairElt(N, 0); + if (!ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) + return SDOperand(); + MVT::ValueType LD1VT = LD1->getValueType(0); + SDNode *LD2 = getBuildPairElt(N, 1); + const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (ISD::isNON_EXTLoad(LD2) && + LD2->hasOneUse() && + TLI.isConsecutiveLoad(LD2, LD1, MVT::getSizeInBits(LD1VT)/8, 1, MFI)) { + LoadSDNode *LD = cast<LoadSDNode>(LD1); + unsigned Align = LD->getAlignment(); + unsigned NewAlign = TLI.getTargetMachine().getTargetData()-> + getABITypeAlignment(MVT::getTypeForValueType(VT)); + if ((!AfterLegalize || TLI.isTypeLegal(VT)) && + TLI.isOperationLegal(ISD::LOAD, VT) && NewAlign <= Align) + return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), + LD->getSrcValue(), LD->getSrcValueOffset(), + LD->isVolatile(), Align); + } + return SDOperand(); +} + SDOperand DAGCombiner::visitBIT_CONVERT(SDNode *N) { SDOperand N0 = N->getOperand(0); MVT::ValueType VT = N->getValueType(0); @@ -3463,10 +3500,22 @@ SDOperand DAGCombiner::visitBIT_CONVERT(SDNode *N) { return DAG.getNode(ISD::OR, VT, X, Cst); } + + // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. + if (N0.getOpcode() == ISD::BUILD_PAIR) { + SDOperand CombineLD = CombineConsecutiveLoads(N0.Val, VT); + if (CombineLD.Val) + return CombineLD; + } return SDOperand(); } +SDOperand DAGCombiner::visitBUILD_PAIR(SDNode *N) { + MVT::ValueType VT = N->getValueType(0); + return CombineConsecutiveLoads(N, VT); +} + /// ConstantFoldBIT_CONVERTofBUILD_VECTOR - We know that BV is a build_vector /// node with Constant, ConstantFP or Undef operands. DstEltVT indicates the /// destination element value type. 
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e57813dfe7f..408a5b23b4f 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1514,7 +1514,7 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA, /// location that the 'Base' load is loading from. bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, unsigned Bytes, int Dist, - MachineFrameInfo *MFI) const { + const MachineFrameInfo *MFI) const { if (LD->getOperand(0).Val != Base->getOperand(0).Val) return false; MVT::ValueType VT = LD->getValueType(0); diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 1a5d9045b05..34b949a6018 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -426,60 +426,6 @@ entry: ret void } -//===---------------------------------------------------------------------===// - -Consider (PR2108): - -#include <emmintrin.h> -__m128i doload64(unsigned long long x) { return _mm_loadl_epi64(&x);} -__m128i doload64_2(unsigned long long *x) { return _mm_loadl_epi64(x);} - -These are very similar routines, but we generate significantly worse code for -the first one on x86-32: - -_doload64: - subl $12, %esp - movl 20(%esp), %eax - movl %eax, 4(%esp) - movl 16(%esp), %eax - movl %eax, (%esp) - movsd (%esp), %xmm0 - addl $12, %esp - ret -_doload64_2: - movl 4(%esp), %eax - movsd (%eax), %xmm0 - ret - -The problem is that the argument lowering logic splits the i64 argument into -2x i32 loads early, the f64 insert doesn't match. Here's a reduced testcase: - -define fastcc double @doload64(i64 %x) nounwind { -entry: - %tmp717 = bitcast i64 %x to double ; <double> [#uses=1] - ret double %tmp717 -} - -compiles to: - -_doload64: - subl $12, %esp - movl 20(%esp), %eax - movl %eax, 4(%esp) - movl 16(%esp), %eax - movl %eax, (%esp) - movsd (%esp), %xmm0 - addl $12, %esp - ret - -instead of movsd from the stack. 
This is actually not too bad to implement. The -best way to do this is to implement a dag combine that turns -bitconvert(build_pair(load a, load b)) into one load of the right type. The -only trick to this is writing the predicate that determines that a/b are at the -right offset from each other. For the enterprising hacker, InferAlignment is a -helpful place to start poking if interested. - - //===---------------------------------------------------------------------===// __m128d test1( __m128d A, __m128d B) { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5d50e36535a..806b626456a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6285,13 +6285,7 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, LD->getAlignment()); } -static SDNode *getBuildPairElt(SDNode *N, unsigned i) { - SDOperand Elt = N->getOperand(i); - if (Elt.getOpcode() != ISD::MERGE_VALUES) - return Elt.Val; - return Elt.getOperand(Elt.ResNo).Val; -} - +/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd. static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -6312,25 +6306,17 @@ static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, return SDOperand(); // Value must be a load. 
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDNode *Base = N->getOperand(0).Val; if (!isa<LoadSDNode>(Base)) { - if (Base->getOpcode() == ISD::BIT_CONVERT) - Base = Base->getOperand(0).Val; - if (Base->getOpcode() != ISD::BUILD_PAIR) + if (Base->getOpcode() != ISD::BIT_CONVERT) return SDOperand(); - SDNode *Pair = Base; - Base = getBuildPairElt(Pair, 0); - if (!ISD::isNON_EXTLoad(Base)) - return SDOperand(); - SDNode *NextLD = getBuildPairElt(Pair, 1); - if (!ISD::isNON_EXTLoad(NextLD) || - !TLI.isConsecutiveLoad(NextLD, Base, 4/*32 bits*/, 1, MFI)) + Base = Base->getOperand(0).Val; + if (!isa<LoadSDNode>(Base)) return SDOperand(); } - LoadSDNode *LD = cast<LoadSDNode>(Base); // Transform it into VZEXT_LOAD addr. + LoadSDNode *LD = cast<LoadSDNode>(Base); return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr()); } diff --git a/test/CodeGen/X86/combine-lds.ll b/test/CodeGen/X86/combine-lds.ll new file mode 100644 index 00000000000..5e0ad996678 --- /dev/null +++ b/test/CodeGen/X86/combine-lds.ll @@ -0,0 +1,7 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movsd +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep mov | count 1 + +define fastcc double @doload64(i64 %x) nounwind { + %tmp717 = bitcast i64 %x to double + ret double %tmp717 +}