Improve the loading of load-anyext vectors by allowing the codegen to load

multiple scalars and insert them into a vector. Next, we shuffle the elements into the correct places, as before. Also fix a small dagcombine bug in SimplifyBinOpWithSameOpcodeHands, when the migration of bitcasts happened too late in the SelectionDAG process. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159991 91177308-0d34-0410-b5e6-96231b3b80d8
2025-05-28 00:40:54 +00:00 · 2012-07-10 13:25:08 +00:00 · 2012-07-10 13:25:08 +00:00 · 2dd83eb1ab
commit 2dd83eb1ab
parent fae96f17b4
5 changed files with 73 additions and 40 deletions
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -2340,7 +2340,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
  // on scalars.
  if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
-      && Level == AfterLegalizeVectorOps) {
+      && Level == AfterLegalizeTypes) {
    SDValue In0 = N0.getOperand(0);
    SDValue In1 = N1.getOperand(0);
    EVT In0Ty = In0.getValueType();
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -14425,7 +14425,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,

 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
@ -14447,47 +14448,73 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    unsigned RegSz = RegVT.getSizeInBits();
    unsigned MemSz = MemVT.getSizeInBits();
    assert(RegSz > MemSz && "Register size must be greater than the mem size");
-    // All sizes must be a power of two
-    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();

-    // Attempt to load the original value using a single load op.
-    // Find a scalar type which is equal to the loaded word size.
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+      return SDValue();
+
+    // Attempt to load the original value using scalar loads.
+    // Find the largest scalar type that divides the total loaded size.
    MVT SclrLoadTy = MVT::i8;
    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
      MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) &&  Tp.getSizeInBits() == MemSz) {
+      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
        SclrLoadTy = Tp;
-        break;
      }
    }

-    // Proceed if a load word is found.
-    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+    // Calculate the number of scalar loads that we need to perform
+    // in order to load our vector from memory.
+    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

+    // Represent our vector as a sequence of elements which are the
+    // largest scalar that we can load.
    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
      RegSz/SclrLoadTy.getSizeInBits());

+    // Represent the data using the same element type that is stored in
+    // memory. In practice, we ''widen'' MemVT.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  RegSz/MemVT.getScalarType().getSizeInBits());
-    // Can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

-    // Perform a single load.
-    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
-                                  Ld->getBasePtr(),
-                                  Ld->getPointerInfo(), Ld->isVolatile(),
-                                  Ld->isNonTemporal(), Ld->isInvariant(),
-                                  Ld->getAlignment());
+    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+      "Invalid vector type");

-    // Insert the word loaded into a vector.
-    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-      LoadUnitVecVT, ScalarLoad);
+    // We can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+    for (unsigned i = 0; i < NumLoads; ++i) {
+      // Perform a single load.
+      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                       Ptr, Ld->getPointerInfo(),
+                                       Ld->isVolatile(), Ld->isNonTemporal(),
+                                       Ld->isInvariant(), Ld->getAlignment());
+      Chains.push_back(ScalarLoad.getValue(1));
+      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+      // another round of DAGCombining.
+      if (i == 0)
+        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+      else
+        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                          ScalarLoad, DAG.getIntPtrConstant(i));
+
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    }
+
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());

    // Bitcast the loaded value to a vector of the original element type, in
    // the size of the target vector type.
-    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
-                                    ScalarInVector);
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
    unsigned SizeRatio = RegSz/MemSz;

    // Redistribute the loaded elements into the different locations.
@ -14503,8 +14530,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
    // Replace the original load with the new sequence
    // and return the new chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
-    return SDValue(ScalarLoad.getNode(), 1);
+    return DCI.CombineTo(N, Shuff, TF, true);
  }

  return SDValue();
@ -14574,8 +14600,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

-    // Can't shuffle using an illegal type
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
@ -15308,7 +15335,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
-  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@ -0,0 +1,13 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+; CHECK: load_store
+define void @load_store(<4 x i16>* %in) {
+entry:
+  %A27 = load <4 x i16>* %in, align 4
+  %A28 = add <4 x i16> %A27, %A27
+  store <4 x i16> %A28, <4 x i16>* %in, align 4
+  ret void
+; CHECK: movd
+; CHECK: pinsrd
+; CHECK: ret
+}
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
 entry:
 ; CHECK: cfi_def_cfa_offset
 ; CHECK-NOT: set
-; CHECK: movzwl
-; CHECK: movzwl
+; CHECK: punpcklwd
 ; CHECK: pshufd
 ; CHECK: pshufb
  %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@ -1,18 +1,12 @@
 ; RUN: llc < %s -o - -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -o - -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s -check-prefix=WIN64
 ; PR4891

 ; Both loads should happen before either store.

-; CHECK: movd  ({{.*}}), {{.*}}
-; CHECK: movd  ({{.*}}), {{.*}}
-; CHECK: movd  {{.*}}, ({{.*}})
-; CHECK: movd  {{.*}}, ({{.*}})
-
-; WIN64: movd  ({{.*}}), {{.*}}
-; WIN64: movd  ({{.*}}), {{.*}}
-; WIN64: movd  {{.*}}, ({{.*}})
-; WIN64: movd  {{.*}}, ({{.*}})
+; CHECK: movl  ({{.*}}), {{.*}}
+; CHECK: movl  ({{.*}}), {{.*}}
+; CHECK: movl  {{.*}}, ({{.*}})
+; CHECK: movl  {{.*}}, ({{.*}})

 define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
 entry: