diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index da3293bc8c2..f03a2a9e61f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -301,6 +301,11 @@ namespace {
     /// looking for a better chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
+    /// Merge consecutive store operations into a wide store.
+    /// This optimization uses wide integers or vectors when possible.
+    /// \return True if some memory operations were changed.
+    bool MergeConsecutiveStores(StoreSDNode *N);
+
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
       : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
@@ -7423,6 +7428,391 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
   return SDValue();
 }
 
+/// Returns the base pointer and an integer offset from that object.
+static std::pair<SDValue, int64_t> GetPointerBaseAndOffset(SDValue Ptr) {
+  if (Ptr->getOpcode() == ISD::ADD &&
+      isa<ConstantSDNode>(Ptr->getOperand(1))) {
+    int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
+    SDValue Base = Ptr->getOperand(0);
+    return std::make_pair(Base, Offset);
+  }
+
+  return std::make_pair(Ptr, 0);
+}
+
+/// Holds a pointer to an LSBaseSDNode as well as information on where it
+/// is located in a sequence of memory operations connected by a chain.
+struct MemOpLink {
+  MemOpLink(LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+    MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+  // Ptr to the mem node.
+  LSBaseSDNode *MemNode;
+  // Offset from the base ptr.
+  int64_t OffsetFromBase;
+  // The sequence number of this mem node; the lowest mem operand in the
+  // DAG starts at zero.
+  unsigned SequenceNum;
+};
+
+/// Sorts store nodes in a list according to their offset from a shared
+/// base ptr.
+struct ConsecutiveMemoryChainSorter {
+  bool operator()(MemOpLink LHS, MemOpLink RHS) {
+    return LHS.OffsetFromBase < RHS.OffsetFromBase;
+  }
+};
+
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+  EVT MemVT = St->getMemoryVT();
+  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
+
+  // Don't merge vectors into wider inputs.
+  if (MemVT.isVector() || !MemVT.isSimple())
+    return false;
+
+  // Perform an early exit check. Do not bother looking at stored values that
+  // are not constants or loads.
+  SDValue StoredVal = St->getValue();
+  bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
+  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
+      !IsLoadSrc)
+    return false;
+
+  // Only look at ends of store sequences.
+  SDValue Chain = SDValue(St, 1);
+  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
+    return false;
+
+  // This holds the base pointer and the offset in bytes from the base
+  // pointer.
+  std::pair<SDValue, int64_t> BasePtr =
+      GetPointerBaseAndOffset(St->getBasePtr());
+
+  // We must have a base and an offset.
+  if (!BasePtr.first.getNode())
+    return false;
+
+  // Do not handle stores to undef base pointers.
+  if (BasePtr.first.getOpcode() == ISD::UNDEF)
+    return false;
+
+  SmallVector<MemOpLink, 8> StoreNodes;
+  // Walk up the chain and look for nodes with offsets from the same
+  // base pointer. Stop when reaching an instruction with a different kind
+  // or instruction which has a different base pointer.
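+  //
+  // For illustration only (schematic, not actual SelectionDAG dump syntax):
+  // given three chained byte stores
+  //
+  //   t1 = store i8 1, %p          ; offset 0
+  //   t2 = store i8 2, (%p + 1)    ; offset 1, chain t1
+  //   t3 = store i8 3, (%p + 2)    ; offset 2, chain t2   <- St
+  //
+  // the loop below starts at St (t3) and climbs through each node's chain
+  // operand to t2 and then t1, recording a MemOpLink {node, offset, seq}
+  // for every store that passes the checks.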
+  unsigned Seq = 0;
+  StoreSDNode *Index = St;
+  while (Index) {
+    // If the chain has more than one use, then we can't reorder the mem ops.
+    if (Index != St && !SDValue(Index, 1)->hasOneUse())
+      break;
+
+    // Find the base pointer and offset for this memory node.
+    std::pair<SDValue, int64_t> Ptr =
+      GetPointerBaseAndOffset(Index->getBasePtr());
+
+    // Check that the base pointer is the same as the original one.
+    if (Ptr.first.getNode() != BasePtr.first.getNode())
+      break;
+
+    // Check that the alignment is the same.
+    if (Index->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Index->isVolatile() || Index->isIndexed())
+      break;
+
+    // No truncation.
+    if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
+      if (St->isTruncatingStore())
+        break;
+
+    // The stored memory type must be the same.
+    if (Index->getMemoryVT() != MemVT)
+      break;
+
+    // We do not allow unaligned stores because we want to prevent
+    // overwriting stores.
+    if (Index->getAlignment()*8 != MemVT.getSizeInBits())
+      break;
+
+    // We found a potential memory operand to merge.
+    StoreNodes.push_back(MemOpLink(Index, Ptr.second, Seq++));
+
+    // Move up the chain to the next memory operation.
+    Index = dyn_cast<StoreSDNode>(Index->getChain().getNode());
+  }
+
+  // Check if there is anything to merge.
+  if (StoreNodes.size() < 2)
+    return false;
+
+  // Sort the memory operands according to their distance from the base
+  // pointer.
+  std::sort(StoreNodes.begin(), StoreNodes.end(),
+            ConsecutiveMemoryChainSorter());
+
+  // Scan the memory operations on the chain and find the first
+  // non-consecutive store memory address.
+  unsigned LastConsecutiveStore = 0;
+  int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+  for (unsigned i=1; i<StoreNodes.size(); ++i) {
+    int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+      break;
+
+    // Mark this node as useful.
+    LastConsecutiveStore = i;
+  }
+
+  // The node with the lowest store address.
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+
+  // Store the constants into memory as one consecutive store.
+  if (!IsLoadSrc) {
+    unsigned LastConst = 0;
+    unsigned LastLegalType = 0;
+    for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      SDValue StoredVal = St->getValue();
+      bool IsConst = (isa<ConstantSDNode>(StoredVal) ||
+                      isa<ConstantFPSDNode>(StoredVal));
+      if (!IsConst)
+        break;
+
+      // Mark this index as the largest legal constant.
+      LastConst = i;
+
+      // Find a legal type for the constant store.
+      unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+      if (TLI.isTypeLegal(StoreTy))
+        LastLegalType = i+1;
+    }
+
+    // Check if we found a legal integer type to store.
+    if (LastLegalType == 0)
+      return false;
+
+    // We add a +1 because the LastXXX variables refer to array location
+    // while NumElem holds the size.
+    unsigned NumElem = std::min(LastConsecutiveStore, LastConst) + 1;
+    NumElem = std::min(LastLegalType, NumElem);
+
+    unsigned EarliestNodeUsed = 0;
+    for (unsigned i=0; i < NumElem; ++i) {
+      // Find a chain for the new wide-store operand. Notice that some
+      // of the store nodes that we found may not be selected for inclusion
+      // in the wide store. The chain we use needs to be the chain of the
+      // earliest store node which is *used* and replaced by the wide store.
+      if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+        EarliestNodeUsed = i;
+    }
+
+    // The earliest Node in the DAG.
+    LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+
+    // Make sure we have something to merge.
+    if (NumElem < 2)
+      return false;
+
+    DebugLoc DL = StoreNodes[0].MemNode->getDebugLoc();
+    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    APInt StoreInt(StoreBW, 0);
+
+    // Construct a single integer constant which is made of the smaller
+    // constant inputs.
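+    //
+    // For illustration only: on a little-endian target, four merged i8
+    // stores of 1, 2, 3, 4 (offsets 0..3) are visited from the highest
+    // offset down, so StoreInt accumulates as
+    //   0x04 -> 0x0403 -> 0x040302 -> 0x04030201
+    // and the merged i32 store still places byte 0x01 at offset 0,
+    // matching the original stores.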
+    bool IsLE = TLI.isLittleEndian();
+    for (unsigned i = 0; i < NumElem; ++i) {
+      unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+      SDValue Val = St->getValue();
+      StoreInt <<= ElementSizeBytes*8;
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+        StoreInt |= C->getAPIntValue().zext(StoreBW);
+      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+        StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
+      } else {
+        assert(false && "Invalid constant element type");
+      }
+    }
+
+    // Create the new Load and Store operations.
+    EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+    SDValue WideInt = DAG.getConstant(StoreInt, StoreTy);
+    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, WideInt,
+                                    FirstInChain->getBasePtr(),
+                                    FirstInChain->getPointerInfo(),
+                                    false, false,
+                                    FirstInChain->getAlignment());
+
+    // Replace the first store with the new store.
+    CombineTo(EarliestOp, NewStore);
+    // Erase all other stores.
+    for (unsigned i = 0; i < NumElem; ++i) {
+      if (StoreNodes[i].MemNode == EarliestOp)
+        continue;
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+      removeFromWorkList(St);
+      DAG.DeleteNode(St);
+    }
+
+    return true;
+  }
+
+  // Below we handle the case of multiple consecutive stores that
+  // come from multiple consecutive loads. We merge them into a single
+  // wide load and a single wide store.
+
+  // Look for load nodes which are used by the stored values.
+  SmallVector<MemOpLink, 8> LoadNodes;
+
+  // Find acceptable loads. Loads need to have the same chain (token factor),
+  // must not be zext, volatile, indexed, and they must be consecutive.
+  SDValue LdBasePtr;
+  for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+    if (!Ld) break;
+
+    // Loads must only have one use.
+    if (!Ld->hasNUsesOfValue(1, 0))
+      break;
+
+    // Check that the alignment is the same as the stores.
+    if (Ld->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Ld->isVolatile() || Ld->isIndexed())
+      break;
+
+    // We do not accept ext loads.
+    if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+      break;
+
+    // The stored memory type must be the same.
+    if (Ld->getMemoryVT() != MemVT)
+      break;
+
+    std::pair<SDValue, int64_t> LdPtr =
+      GetPointerBaseAndOffset(Ld->getBasePtr());
+
+    // If this is not the first ptr that we check.
+    if (LdBasePtr.getNode()) {
+      // The base ptr must be the same.
+      if (LdPtr.first != LdBasePtr)
+        break;
+    } else {
+      // Record the base ptr; all of the other loads must use the same one.
+      LdBasePtr = LdPtr.first;
+    }
+
+    // We found a potential memory operand to merge.
+    LoadNodes.push_back(MemOpLink(Ld, LdPtr.second, 0));
+  }
+
+  if (LoadNodes.size() < 2)
+    return false;
+
+  // Scan the memory operations on the chain and find the first
+  // non-consecutive load memory address. These variables hold the index in
+  // the store node array.
+  unsigned LastConsecutiveLoad = 0;
+  // This variable refers to the size and not index in the array.
+  unsigned LastLegalVectorType = 0;
+  unsigned LastLegalIntegerType = 0;
+  StartAddress = LoadNodes[0].OffsetFromBase;
+  for (unsigned i=1; i<LoadNodes.size(); ++i) {
+    int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+      break;
+    LastConsecutiveLoad = i;
+
+    // Find a legal type for the vector store.
+    EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+    if (TLI.isTypeLegal(StoreTy))
+      LastLegalVectorType = i + 1;
+
+    // Find a legal type for the integer store.
+    unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+    StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+    if (TLI.isTypeLegal(StoreTy))
+      LastLegalIntegerType = i + 1;
+  }
+
+  // Only use vector types if the vector type is larger than the integer
+  // type. If they are the same, use integers.
+  bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType;
+  unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
+
+  // We add +1 here because the LastXXX variables refer to location while
+  // the NumElem refers to array/index size.
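+  //
+  // For illustration only: if four consecutive i16 loads feed four
+  // consecutive i16 stores, LastConsecutiveStore == LastConsecutiveLoad == 3
+  // (array indices), so min(3, 3) + 1 == 4 elements are merge candidates;
+  // if the widest legal type only covers two elements (LastLegalType == 2),
+  // NumElem is clamped down to 2.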
+  unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+  NumElem = std::min(LastLegalType, NumElem);
+
+  if (NumElem < 2)
+    return false;
+
+  // The earliest Node in the DAG.
+  unsigned EarliestNodeUsed = 0;
+  for (unsigned i=1; i<NumElem; ++i) {
+    if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+      EarliestNodeUsed = i;
+  }
+  LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+
+  // Find if it is better to use vectors or integers to load and store
+  // to memory.
+  EVT JointMemOpVT;
+  if (UseVectorTy) {
+    JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+  } else {
+    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+  }
+
+  DebugLoc LoadDL = LoadNodes[0].MemNode->getDebugLoc();
+  DebugLoc StoreDL = StoreNodes[0].MemNode->getDebugLoc();
+
+  LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+  SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
+                                FirstLoad->getChain(),
+                                FirstLoad->getBasePtr(),
+                                FirstLoad->getPointerInfo(),
+                                false, false, false,
+                                FirstLoad->getAlignment());
+
+  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
+                                  FirstInChain->getBasePtr(),
+                                  FirstInChain->getPointerInfo(), false, false,
+                                  FirstInChain->getAlignment());
+
+  // Replace the first store with the new store.
+  CombineTo(EarliestOp, NewStore);
+  // Rewire the loads and erase all other stores.
+  for (unsigned i = 0; i < NumElem; ++i) {
+    // Replace all chain users of the old load nodes with the chain of the
+    // new load node.
+    LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                  SDValue(NewLoad.getNode(), 1));
+
+    // Remove all Store nodes.
+    if (StoreNodes[i].MemNode == EarliestOp)
+      continue;
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+    removeFromWorkList(St);
+    DAG.DeleteNode(St);
+  }
+
+  return true;
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -7624,6 +8014,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                              ST->getAlignment());
   }
 
+  // Only perform this optimization before the types are legal, because we
+  // don't want to perform this optimization multiple times.
+  if (!LegalTypes && MergeConsecutiveStores(ST))
+    return SDValue(N, 0);
+
   return ReduceLoadOpStoreWidth(N);
 }
 
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
new file mode 100644
index 00000000000..79f8ee54a2a
--- /dev/null
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -0,0 +1,273 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; Move all of the constants using a single 64-bit integer store.
+; CHECK: merge_const_store
+; save 1,2,3 ... as one big integer.
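+; For illustration: bytes 1..8 at offsets 0..7 pack little-endian into
+; 0x0807060504030201, which is 578437695752307201 in decimal -- the
+; immediate expected below.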
+; CHECK: movabsq $578437695752307201
+; CHECK: ret
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 5, i8* %6, align 1
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+; Move the first 4 constants as a single 32-bit integer store
+; (67305985 == 0x04030201). Move the rest as scalars.
+; CHECK: merge_nonconst_store
+; CHECK: movl $67305985
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 %zz, i8* %6, align 1                   ; <----------- Not a const;
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+
+;CHECK: merge_loads_i16
+; load:
+;CHECK: movw
+; store:
+;CHECK: movw
+;CHECK: ret
+define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+  br label %4
+
+;