From eb38ebf15c326a5bb45ca9da6329cdf19ad6df95 Mon Sep 17 00:00:00 2001
From: Mon P Wang
Date: Sun, 24 Jan 2010 00:05:03 +0000
Subject: [PATCH] Improved widening loads by adding support for wider loads if
 the alignment allows.  Fixed a bug where we didn't use a vector load/store
 for PR5626.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@94338 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   4 +-
 lib/CodeGen/SelectionDAG/LegalizeTypes.h      |  54 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 594 +++++++++++-------
 lib/Target/X86/X86ISelLowering.cpp            |  34 +
 lib/Target/X86/X86ISelLowering.h              |   6 +
 lib/Target/X86/X86InstrMMX.td                 |  14 +
 test/CodeGen/X86/widen_cast-2.ll              |   6 +-
 test/CodeGen/X86/widen_load-1.ll              |   2 +-
 test/CodeGen/X86/widen_load-2.ll              | 155 +++++
 9 files changed, 593 insertions(+), 276 deletions(-)
 create mode 100644 test/CodeGen/X86/widen_load-2.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5e3f58a8af4..1c83869d774 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1533,10 +1533,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
     Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
 
     // If EltVT smaller than OpVT, only store the bits necessary.
-    if (EltVT.bitsLT(OpVT))
+    if (!OpVT.isVector() && EltVT.bitsLT(OpVT)) {
       Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
                                          Node->getOperand(i), Idx, SV, Offset,
                                          EltVT));
-    else
+    } else
       Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
                                     Node->getOperand(i), Idx, SV, Offset));
   }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index b5dbd41eb97..bf231d9e526 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -633,43 +633,33 @@ private:
   // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
   //===--------------------------------------------------------------------===//
 
-  /// Helper genWidenVectorLoads - Helper function to generate a set of
+  /// Helper GenWidenVectorLoads - Helper function to generate a set of
   /// loads to load a vector with a resulting wider type. It takes
-  /// ExtType: Extension type
-  /// LdChain: list of chains for the load we have generated.
-  /// Chain: incoming chain for the ld vector.
-  /// BasePtr: base pointer to load from.
-  /// SV: memory disambiguation source value.
-  /// SVOffset: memory disambiugation offset.
-  /// Alignment: alignment of the memory.
-  /// isVolatile: volatile load.
-  /// LdWidth: width of memory that we want to load.
-  /// ResType: the wider result result type for the resulting vector.
-  /// dl: DebugLoc to be applied to new nodes
-  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain, SDValue Chain,
-                              SDValue BasePtr, const Value *SV,
-                              int SVOffset, unsigned Alignment,
-                              bool isVolatile, unsigned LdWidth,
-                              EVT ResType, DebugLoc dl);
+  /// LdChain: list of chains for the load to be generated.
+  /// Ld: load to widen
+  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+                              LoadSDNode *LD);
+
+  /// GenWidenVectorExtLoads - Helper function to generate a set of extension
+  /// loads to load a vector with a resulting wider type. It takes
+  /// LdChain: list of chains for the load to be generated.
+  /// Ld: load to widen
+  /// ExtType: extension type
+  SDValue GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                 LoadSDNode *LD, ISD::LoadExtType ExtType);
 
   /// Helper genWidenVectorStores - Helper function to generate a set of
   /// stores to store a widen vector into non widen memory
-  /// It takes
   /// StChain: list of chains for the stores we have generated
-  /// Chain: incoming chain for the ld vector
-  /// BasePtr: base pointer to load from
-  /// SV: memory disambiguation source value
-  /// SVOffset: memory disambiugation offset
-  /// Alignment: alignment of the memory
-  /// isVolatile: volatile lod
-  /// ValOp: value to store
-  /// StWidth: width of memory that we want to store
-  /// dl: DebugLoc to be applied to new nodes
-  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, SDValue Chain,
-                            SDValue BasePtr, const Value *SV,
-                            int SVOffset, unsigned Alignment,
-                            bool isVolatile, SDValue ValOp,
-                            unsigned StWidth, DebugLoc dl);
+  /// ST: store of a widen value
+  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+                            StoreSDNode *ST);
+
+  /// Helper genWidenVectorTruncStores - Helper function to generate a set of
+  /// stores to store a truncated widen vector into non widen memory
+  /// StChain: list of chains for the stores we have generated
+  /// ST: store of a widen value
+  void GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                 StoreSDNode *ST);
 
   /// Modifies a vector input (widen or narrows) to a vector of NVT. The
   /// input vector must have the same element type as NVT.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 808bac70fda..192cdeee835 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1655,68 +1655,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
-  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
-  EVT LdVT = LD->getMemoryVT();
-  DebugLoc dl = N->getDebugLoc();
-  assert(LdVT.isVector() && WidenVT.isVector());
-
-  // Load information
-  SDValue Chain = LD->getChain();
-  SDValue BasePtr = LD->getBasePtr();
-  int SVOffset = LD->getSrcValueOffset();
-  unsigned Align = LD->getAlignment();
-  bool isVolatile = LD->isVolatile();
-  const Value *SV = LD->getSrcValue();
   ISD::LoadExtType ExtType = LD->getExtensionType();
 
   SDValue Result;
   SmallVector<SDValue, 16> LdChain;  // Chain for the series of load
-  if (ExtType != ISD::NON_EXTLOAD) {
-    // For extension loads, we can not play the tricks of chopping legal
-    // vector types and bit cast it to the right type. Instead, we unroll
-    // the load and build a vector.
-    EVT EltVT = WidenVT.getVectorElementType();
-    EVT LdEltVT = LdVT.getVectorElementType();
-    unsigned NumElts = LdVT.getVectorNumElements();
+  if (ExtType != ISD::NON_EXTLOAD)
+    Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+  else
+    Result = GenWidenVectorLoads(LdChain, LD);
 
-    // Load each element and widen
-    unsigned WidenNumElts = WidenVT.getVectorNumElements();
-    SmallVector<SDValue, 16> Ops(WidenNumElts);
-    unsigned Increment = LdEltVT.getSizeInBits() / 8;
-    Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset,
-                            LdEltVT, isVolatile, Align);
-    LdChain.push_back(Ops[0].getValue(1));
-    unsigned i = 0, Offset = Increment;
-    for (i=1; i < NumElts; ++i, Offset += Increment) {
-      SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                       BasePtr, DAG.getIntPtrConstant(Offset));
-      Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV,
-                              SVOffset + Offset, LdEltVT, isVolatile, Align);
-      LdChain.push_back(Ops[i].getValue(1));
-    }
-
-    // Fill the rest with undefs
-    SDValue UndefVal = DAG.getUNDEF(EltVT);
-    for (; i != WidenNumElts; ++i)
-      Ops[i] = UndefVal;
-
-    Result = DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
-  } else {
-    assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
-    unsigned int LdWidth = LdVT.getSizeInBits();
-    Result = GenWidenVectorLoads(LdChain, Chain, BasePtr, SV, SVOffset,
-                                 Align, isVolatile, LdWidth, WidenVT, dl);
-  }
-
-  // If we generate a single load, we can use that for the chain. Otherwise,
-  // build a factor node to remember the multiple loads are independent and
-  // chain to that.
-  SDValue NewChain;
-  if (LdChain.size() == 1)
-    NewChain = LdChain[0];
-  else
-    NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LdChain[0],
-                           LdChain.size());
+  // If we generate a single load, we can use that for the chain. Otherwise,
+  // build a factor node to remember the multiple loads are independent and
+  // chain to that.
+  SDValue NewChain;
+  if (LdChain.size() == 1)
+    NewChain = LdChain[0];
+  else
+    NewChain = DAG.getNode(ISD::TokenFactor, LD->getDebugLoc(), MVT::Other,
+                           &LdChain[0], LdChain.size());
 
   // Modified the chain - switch anything that used the old chain to use
   // the new one.
@@ -1954,57 +1910,17 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   // We have to widen the value but we want only to store the original
   // vector type.
   StoreSDNode *ST = cast<StoreSDNode>(N);
-  SDValue Chain = ST->getChain();
-  SDValue BasePtr = ST->getBasePtr();
-  const Value *SV = ST->getSrcValue();
-  int SVOffset = ST->getSrcValueOffset();
-  unsigned Align = ST->getAlignment();
-  bool isVolatile = ST->isVolatile();
-  SDValue ValOp = GetWidenedVector(ST->getValue());
-  DebugLoc dl = N->getDebugLoc();
-
-  EVT StVT = ST->getMemoryVT();
-  EVT ValVT = ValOp.getValueType();
-  // It must be true that we the widen vector type is bigger than where
-  // we need to store.
-  assert(StVT.isVector() && ValOp.getValueType().isVector());
-  assert(StVT.bitsLT(ValOp.getValueType()));
 
   SmallVector<SDValue, 16> StChain;
-  if (ST->isTruncatingStore()) {
-    // For truncating stores, we can not play the tricks of chopping legal
-    // vector types and bit cast it to the right type. Instead, we unroll
-    // the store.
-    EVT StEltVT = StVT.getVectorElementType();
-    EVT ValEltVT = ValVT.getVectorElementType();
-    unsigned Increment = ValEltVT.getSizeInBits() / 8;
-    unsigned NumElts = StVT.getVectorNumElements();
-    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                              DAG.getIntPtrConstant(0));
-    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
-                                        SVOffset, StEltVT,
-                                        isVolatile, Align));
-    unsigned Offset = Increment;
-    for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
-      SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                       BasePtr, DAG.getIntPtrConstant(Offset));
-      SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                                DAG.getIntPtrConstant(0));
-      StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
-                                          SVOffset + Offset, StEltVT,
-                                          isVolatile, MinAlign(Align, Offset)));
-    }
-  }
-  else {
-    assert(StVT.getVectorElementType() == ValVT.getVectorElementType());
-    // Store value
-    GenWidenVectorStores(StChain, Chain, BasePtr, SV, SVOffset,
-                         Align, isVolatile, ValOp, StVT.getSizeInBits(), dl);
-  }
+  if (ST->isTruncatingStore())
+    GenWidenVectorTruncStores(StChain, ST);
+  else
+    GenWidenVectorStores(StChain, ST);
+
   if (StChain.size() == 1)
     return StChain[0];
   else
-    return DAG.getNode(ISD::TokenFactor, dl,
+    return DAG.getNode(ISD::TokenFactor, ST->getDebugLoc(),
                        MVT::Other,&StChain[0],StChain.size());
 }
 
@@ -2012,179 +1928,383 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
 // Vector Widening Utilities
 //===----------------------------------------------------------------------===//
 
+// Utility function to find the type to chop up a widen vector for load/store
+// TLI: Target lowering used to determine legal types.
+// Width: Width left to load/store.
+// WidenVT: The widen vector type to load to/store from.
+// Align: If 0, don't allow use of a wider type.
+// WidenEx: If Align is not 0, the additional amount we can load/store from.
 
-// Utility function to find a vector type and its associated element
-// type from a preferred width and whose vector type must be the same size
-// as the VecVT.
-// TLI: Target lowering used to determine legal types.
-// Width: Preferred width to store.
-// VecVT: Vector value type whose size we must match.
-// Returns NewVecVT and NewEltVT - the vector type and its associated
-// element type.
-static void FindAssocWidenVecType(SelectionDAG& DAG,
-                                  const TargetLowering &TLI, unsigned Width,
-                                  EVT VecVT,
-                                  EVT& NewEltVT, EVT& NewVecVT) {
-  unsigned EltWidth = Width + 1;
-  if (TLI.isTypeLegal(VecVT)) {
-    // We start with the preferred with, making it a power of 2 and find a
-    // legal vector type of that width. If not, we reduce it by another of 2.
-    // For incoming type is legal, this process will end as a vector of the
-    // smallest loadable type should always be legal.
-    do {
-      assert(EltWidth > 0);
-      EltWidth = 1 << Log2_32(EltWidth - 1);
-      NewEltVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
-      unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
-      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, NumElts);
-    } while (!TLI.isTypeLegal(NewVecVT) ||
-             VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
-  } else {
-    // The incoming vector type is illegal and is the result of widening
-    // a vector to a power of 2. In this case, we will use the preferred
-    // with as long as it is a multiple of the incoming vector length.
-    // The legalization process will eventually make this into a legal type
-    // and remove the illegal bit converts (which would turn to stack converts
-    // if they are allow to exist).
-    do {
-      assert(EltWidth > 0);
-      EltWidth = 1 << Log2_32(EltWidth - 1);
-      NewEltVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
-      unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
-      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, NumElts);
-    } while (!TLI.isTypeLegal(NewEltVT) ||
-             VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
-  }
-}
+static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+                       unsigned Width, EVT WidenVT,
+                       unsigned Align = 0, unsigned WidenEx = 0) {
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+  unsigned AlignInBits = Align*8;
+
+  // If we have one element to load/store, return it.
+  EVT RetVT = WidenEltVT;
+  if (Width == WidenEltWidth)
+    return RetVT;
+
+  // See if there is a larger legal integer type than the element type to
+  // load/store.
+  unsigned VT;
+  for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
+    EVT MemVT((MVT::SimpleValueType) VT);
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (MemVT.getSizeInBits() <= WidenEltWidth)
+      break;
+    if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      RetVT = MemVT;
+      break;
+    }
+  }
+
+  // See if there is a larger vector type to load/store that has the same
+  // vector element type and whose width evenly divides the width of WidenVT.
+  for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
+    EVT MemVT = (MVT::SimpleValueType) VT;
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+        (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+        return MemVT;
+    }
+  }
+
+  return RetVT;
+}
+
+// Builds a vector from scalar loads
+// VecTy: resulting vector type
+// LdOps: scalar loads to use to build the vector
+// [Start,End): the range of loads to use.
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+                                     SmallVector<SDValue, 16>& LdOps,
+                                     unsigned Start, unsigned End) {
+  DebugLoc dl = LdOps[Start].getDebugLoc();
+  EVT LdTy = LdOps[Start].getValueType();
+  unsigned Width = VecTy.getSizeInBits();
+  unsigned NumElts = Width / LdTy.getSizeInBits();
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+  unsigned Idx = 1;
+  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+  for (unsigned i = Start + 1; i != End; ++i) {
+    EVT NewLdTy = LdOps[i].getValueType();
+    if (NewLdTy != LdTy) {
+      NumElts = Width / NewLdTy.getSizeInBits();
+      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+      // Readjust position and vector position based on new load type
+      Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+      LdTy = NewLdTy;
+    }
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+                        DAG.getIntPtrConstant(Idx++));
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VecTy, VecOp);
+}
 
 SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
-                                              SDValue Chain,
-                                              SDValue BasePtr,
-                                              const Value *SV,
-                                              int SVOffset,
-                                              unsigned Alignment,
-                                              bool isVolatile,
-                                              unsigned LdWidth,
-                                              EVT ResType,
-                                              DebugLoc dl) {
+                                              LoadSDNode * LD) {
   // The strategy assumes that we can efficiently load powers of two widths.
-  // The routines chops the vector into the largest power of 2 load and
-  // can be inserted into a legal vector and then cast the result into the
-  // vector type we want. This avoids unnecessary stack converts.
+  // The routine chops the vector into the largest vector loads with the same
+  // element type or scalar loads, and then recombines them into the widen
+  // vector type.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  EVT LdVT = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
 
-  // TODO: If the Ldwidth is legal, alignment is the same as the LdWidth, and
-  // the load is nonvolatile, we an use a wider load for the value.
+  // Load information
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  int SVOffset = LD->getSrcValueOffset();
+  unsigned Align = LD->getAlignment();
+  bool isVolatile = LD->isVolatile();
+  const Value *SV = LD->getSrcValue();
+
+  int LdWidth = LdVT.getSizeInBits();
+  int WidthDiff = WidenWidth - LdWidth;        // Difference
+  unsigned LdAlign = (isVolatile) ? 0 : Align; // Allow wider loads
 
+  // Find the vector type that we can load from.
-  EVT NewEltVT, NewVecVT;
-  unsigned NewEltVTWidth;
-  FindAssocWidenVecType(DAG, TLI, LdWidth, ResType, NewEltVT, NewVecVT);
-  NewEltVTWidth = NewEltVT.getSizeInBits();
-
-  SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV, SVOffset,
-                             isVolatile, Alignment);
-  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+  int NewVTWidth = NewVT.getSizeInBits();
+  SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV, SVOffset,
+                             isVolatile, Align);
   LdChain.push_back(LdOp.getValue(1));
 
   // Check if we can load the element with one instruction
-  if (LdWidth == NewEltVTWidth) {
-    return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+  if (LdWidth <= NewVTWidth) {
+    if (NewVT.isVector()) {
+      if (NewVT != WidenVT) {
+        assert(WidenWidth % NewVTWidth == 0);
+        unsigned NumConcat = WidenWidth / NewVTWidth;
+        SmallVector<SDValue, 16> ConcatOps(NumConcat);
+        SDValue UndefVal = DAG.getUNDEF(NewVT);
+        ConcatOps[0] = LdOp;
+        for (unsigned i = 1; i != NumConcat; ++i)
+          ConcatOps[i] = UndefVal;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0],
+                           NumConcat);
+      } else
+        return LdOp;
+    } else {
+      unsigned NumElts = WidenWidth / LdWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, VecOp);
+    }
   }
 
-  unsigned Idx = 1;
-  LdWidth -= NewEltVTWidth;
+  // Load vector by using multiple loads from largest vector to scalar
+  SmallVector<SDValue, 16> LdOps;
+  LdOps.push_back(LdOp);
+
+  LdWidth -= NewVTWidth;
   unsigned Offset = 0;
 
   while (LdWidth > 0) {
-    unsigned Increment = NewEltVTWidth / 8;
+    unsigned Increment = NewVTWidth / 8;
     Offset += Increment;
     BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                           DAG.getIntPtrConstant(Increment));
 
-    if (LdWidth < NewEltVTWidth) {
-      // Our current type we are using is too large, use a smaller size by
-      // using a smaller power of 2
-      unsigned oNewEltVTWidth = NewEltVTWidth;
-      FindAssocWidenVecType(DAG, TLI, LdWidth, ResType, NewEltVT, NewVecVT);
-      NewEltVTWidth = NewEltVT.getSizeInBits();
-      // Readjust position and vector position based on new load type
-      Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
-      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+    if (LdWidth < NewVTWidth) {
+      // Our current type we are using is too large, find a better size
+      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVTWidth = NewVT.getSizeInBits();
     }
 
-    SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV,
-                               SVOffset+Offset, isVolatile,
-                               MinAlign(Alignment, Offset));
+    SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV,
                                SVOffset+Offset, isVolatile,
+                               MinAlign(Align, Increment));
     LdChain.push_back(LdOp.getValue(1));
-    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOp,
-                        DAG.getIntPtrConstant(Idx++));
+    LdOps.push_back(LdOp);
 
-    LdWidth -= NewEltVTWidth;
+    LdWidth -= NewVTWidth;
   }
 
-  return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+  // Build the vector from the load operations
+  unsigned End = LdOps.size();
+  if (LdOps[0].getValueType().isVector()) {
+    // If the loads contain vectors, build the vector using concat vectors.
+    // All of the vector loads are a power of 2 in size, and the scalar loads
+    // can be combined to make a power of 2 vector.
+    SmallVector<SDValue, 16> ConcatOps(End);
+    int i = End - 1;
+    int Idx = End;
+    EVT LdTy = LdOps[i].getValueType();
+    // First combine the scalar loads to a vector
+    if (!LdTy.isVector()) {
+      for (--i; i >= 0; --i) {
+        LdTy = LdOps[i].getValueType();
+        if (LdTy.isVector())
+          break;
+      }
+      ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End);
+    }
+    ConcatOps[--Idx] = LdOps[i];
+    for (--i; i >= 0; --i) {
+      EVT NewLdTy = LdOps[i].getValueType();
+      if (NewLdTy != LdTy) {
+        // Create a larger vector
+        ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+                                       &ConcatOps[Idx], End - Idx);
+        Idx = End - 1;
+        LdTy = NewLdTy;
+      }
+      ConcatOps[--Idx] = LdOps[i];
+    }
+
+    if (WidenWidth != LdTy.getSizeInBits()*(End - Idx)) {
+      // We need to fill the rest with undefs to build the vector
+      unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+      SmallVector<SDValue, 16> WidenOps(NumOps);
+      SDValue UndefVal = DAG.getUNDEF(LdTy);
+      unsigned i = 0;
+      for (; i != End-Idx; ++i)
+        WidenOps[i] = ConcatOps[Idx+i];
+      for (; i != NumOps; ++i)
+        WidenOps[i] = UndefVal;
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps);
+    } else
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+                         &ConcatOps[Idx], End - Idx);
+  } else // All the loads are scalar loads.
+    return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
 }
 
-void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
-                                            SDValue Chain,
-                                            SDValue BasePtr,
-                                            const Value *SV,
-                                            int SVOffset,
-                                            unsigned Alignment,
-                                            bool isVolatile,
-                                            SDValue ValOp,
-                                            unsigned StWidth,
-                                            DebugLoc dl) {
-  // Breaks the stores into a series of power of 2 width stores. For any
-  // width, we convert the vector to the vector of element size that we
-  // want to store. This avoids requiring a stack convert.
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                         LoadSDNode * LD,
+                                         ISD::LoadExtType ExtType) {
+  // For extension loads, it may not be more efficient to chop up the vector
+  // and then extend it. Instead, we unroll the load and build a new vector.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  EVT LdVT = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
 
-  // Find a width of the element type we can store with
-  EVT WidenVT = ValOp.getValueType();
-  EVT NewEltVT, NewVecVT;
+  // Load information
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  int SVOffset = LD->getSrcValueOffset();
+  unsigned Align = LD->getAlignment();
+  bool isVolatile = LD->isVolatile();
+  const Value *SV = LD->getSrcValue();
 
-  FindAssocWidenVecType(DAG, TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
-  unsigned NewEltVTWidth = NewEltVT.getSizeInBits();
+  EVT EltVT = WidenVT.getVectorElementType();
+  EVT LdEltVT = LdVT.getVectorElementType();
+  unsigned NumElts = LdVT.getVectorNumElements();
 
-  SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
-  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
-                            DAG.getIntPtrConstant(0));
-  SDValue StOp = DAG.getStore(Chain, dl, EOp, BasePtr, SV, SVOffset,
-                              isVolatile, Alignment);
-  StChain.push_back(StOp);
-
-  // Check if we are done
-  if (StWidth == NewEltVTWidth) {
-    return;
+  // Load each element and widen
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Increment = LdEltVT.getSizeInBits() / 8;
+  Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset,
+                          LdEltVT, isVolatile, Align);
+  LdChain.push_back(Ops[0].getValue(1));
+  unsigned i = 0, Offset = Increment;
+  for (i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV,
+                            SVOffset + Offset, LdEltVT, isVolatile, Align);
+    LdChain.push_back(Ops[i].getValue(1));
   }
 
-  unsigned Idx = 1;
-  StWidth -= NewEltVTWidth;
-  unsigned Offset = 0;
+  // Fill the rest with undefs
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i != WidenNumElts; ++i)
+    Ops[i] = UndefVal;
 
-  while (StWidth > 0) {
-    unsigned Increment = NewEltVTWidth / 8;
-    Offset += Increment;
-    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                          DAG.getIntPtrConstant(Increment));
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
+}
 
-    if (StWidth < NewEltVTWidth) {
-      // Our current type we are using is too large, use a smaller size by
-      // using a smaller power of 2
-      unsigned oNewEltVTWidth = NewEltVTWidth;
-      FindAssocWidenVecType(DAG, TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
-      NewEltVTWidth = NewEltVT.getSizeInBits();
-      // Readjust position and vector position based on new load type
-      Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
-      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
-    }
-    EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // The strategy assumes that we can efficiently store powers of two widths.
+  // The routine chops the vector into the largest vector stores with the same
+  // element type or scalar stores.
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  const Value *SV = ST->getSrcValue();
+  int SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  SDValue ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  unsigned StWidth = StVT.getSizeInBits();
+  EVT ValVT = ValOp.getValueType();
+  unsigned ValWidth = ValVT.getSizeInBits();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned ValEltWidth = ValEltVT.getSizeInBits();
+  assert(StVT.getVectorElementType() == ValEltVT);
+
+  int Idx = 0;         // current index to store
+  unsigned Offset = 0; // offset from base to store
+  while (StWidth != 0) {
+    // Find the largest vector type we can store with
+    EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
+    unsigned NewVTWidth = NewVT.getSizeInBits();
+    unsigned Increment = NewVTWidth / 8;
+    if (NewVT.isVector()) {
+      unsigned NumVTElts = NewVT.getVectorNumElements();
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+                                  DAG.getIntPtrConstant(Idx));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+                                       SVOffset + Offset, isVolatile,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        Idx += NumVTElts;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+    } else {
+      // Cast the vector to the scalar type we can store
+      unsigned NumElts = ValWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
+      // Readjust index position based on new vector type
+      Idx = Idx * ValEltWidth / NewVTWidth;
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
                                   DAG.getIntPtrConstant(Idx++));
-    StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
                                    SVOffset + Offset, isVolatile,
-                                   MinAlign(Alignment, Offset)));
-    StWidth -= NewEltVTWidth;
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+      // Restore index back to be relative to the original widen element type
+      Idx = Idx * NewVTWidth / ValEltWidth;
+    }
+  }
+}
+
+void
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // For truncating stores, it may not be more efficient to truncate the whole
+  // vector and then store it. Instead, we extract each element and then store
+  // it.
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  const Value *SV = ST->getSrcValue();
+  int SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  SDValue ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  EVT ValVT = ValOp.getValueType();
+
+  // It must be true that the widen vector type is bigger than what
+  // we need to store.
+  assert(StVT.isVector() && ValOp.getValueType().isVector());
+  assert(StVT.bitsLT(ValOp.getValueType()));
+
+  // For truncating stores, we can not play the tricks of chopping legal
+  // vector types and bit cast it to the right type. Instead, we unroll
+  // the store.
+  EVT StEltVT = StVT.getVectorElementType();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned Increment = ValEltVT.getSizeInBits() / 8;
+  unsigned NumElts = StVT.getVectorNumElements();
+  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                            DAG.getIntPtrConstant(0));
+  StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
+                                      SVOffset, StEltVT,
+                                      isVolatile, Align));
+  unsigned Offset = Increment;
+  for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                              DAG.getIntPtrConstant(0));
+    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
+                                        SVOffset + Offset, StEltVT,
+                                        isVolatile, MinAlign(Align, Offset)));
+  }
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 11e07dfec34..ab51369555c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -747,6 +747,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
       EVT VT = (MVT::SimpleValueType)i;
@@ -3686,6 +3692,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // We support concatenating two MMX registers and placing them in an XMM
+  // register. This is better than doing a stack convert.
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ResVT = Op.getValueType();
+  assert(Op.getNumOperands() == 2);
+  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
+         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
+  int Mask[2];
+  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0));
+  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+  InVec = Op.getOperand(1);
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    unsigned NumElts = ResVT.getVectorNumElements();
+    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
+                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
+  } else {
+    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
+    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+    Mask[0] = 0; Mask[1] = 2;
+    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+}
+
 // v8i16 shuffles - Prefer shuffles in the following order:
 // 1. [all]   pshuflw, pshufhw, optional move
 // 2. [ssse3] 1 x pshufb
@@ -7238,6 +7271,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
   case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 64bc70c62fb..11b52257f69 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -156,6 +156,11 @@ namespace llvm {
       /// relative displacements.
       WrapperRIP,
 
+      /// MOVQ2DQ - Copies a 64-bit value from a vector to another vector.
+      /// Can be used to move a vector value from an MMX register to an XMM
+      /// register.
+      MOVQ2DQ,
+
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
@@ -634,6 +639,7 @@ namespace llvm {
     SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                    SelectionDAG &DAG);
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG);
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
     SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index fc40c9a4200..ab169ac395f 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -501,6 +501,20 @@ let Constraints = "$src1 = $dst" in {
                                   (iPTR imm:$src3))))]>;
 }
 
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, v1i64>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
+          (v2i64 (MOVQI2PQIrm addr:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (v1i64 (bitconvert
+                            (v2i32 (scalar_to_vector (loadi32 addr:$src))))))),
+          (v2i64 (MOVDI2PDIrm addr:$src))>;
+
 // Mask creation
 def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
                           "pmovmskb\t{$src, $dst|$dst, $src}",
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index e5d2c6a61e2..1e626a2f882 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -2,10 +2,8 @@
 ; CHECK: pextrd
 ; CHECK: pextrd
 ; CHECK: movd
-; CHECK: pextrd
-; CHECK: pextrd
-; CHECK: pextrd
-; CHECK: movd
+; CHECK: movaps
+
 
 ; bitcast v14i16 to v7i32
 
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 8a970bff498..d397645f193 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -3,7 +3,7 @@
 ; This load should be before the call, not after.
-; CHECK: movq compl+128(%rip), %xmm0
+; CHECK: movaps compl+128(%rip), %xmm0
 ; CHECK: movaps %xmm0, (%rsp)
 ; CHECK: callq killcommon
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
new file mode 100644
index 00000000000..11383fa3082
--- /dev/null
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -o - -march=x86-64 -mattr=+sse42 -disable-mmx | FileCheck %s
+
+; Test widening loads and stores, based on PR5626.
+;
+
+%i32vec3 = type <3 x i32>
+define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+  %a = load %i32vec3* %ap, align 16
+  %b = load %i32vec3* %bp, align 16
+  %x = add %i32vec3 %a, %b
+  store %i32vec3 %x, %i32vec3* %ret, align 16
+  ret void
+}
+
+define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
+; CHECK: movq
+; CHECK: pinsrd
+; CHECK: movq
+; CHECK: pinsrd
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+  %a = load %i32vec3* %ap
+  %b = load %i32vec3* %bp
+  %x = add %i32vec3 %a, %b
+  store %i32vec3 %x, %i32vec3* %ret
+  ret void
+}
+
+%i32vec7 = type <7 x i32>
+define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: pextrd
+; CHECK: movq
+; CHECK: movaps
+  %a = load %i32vec7* %ap, align 16
+  %b = load %i32vec7* %bp, align 16
+  %x = add %i32vec7 %a, %b
+  store %i32vec7 %x, %i32vec7* %ret, align 16
+  ret void
+}
+
+%i32vec12 = type <12 x i32>
+define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: paddd
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+  %a = load %i32vec12* %ap, align 16
+  %b = load %i32vec12* %bp, align 16
+  %x = add %i32vec12 %a, %b
+  store %i32vec12 %x, %i32vec12* %ret, align 16
+  ret void
+}
+
+
+%i16vec3 = type <3 x i16>
+define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: movd
+; CHECK: pextrw
+  %a = load %i16vec3* %ap, align 16
+  %b = load %i16vec3* %bp, align 16
+  %x = add %i16vec3 %a, %b
+  store %i16vec3 %x, %i16vec3* %ret, align 16
+  ret void
+}
+
+%i16vec4 = type <4 x i16>
+define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: movq
+  %a = load %i16vec4* %ap, align 16
+  %b = load %i16vec4* %bp, align 16
+  %x = add %i16vec4 %a, %b
+  store %i16vec4 %x, %i16vec4* %ret, align 16
+  ret void
+}
+
+%i16vec12 = type <12 x i16>
+define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: movq
+; CHECK: movaps
+  %a = load %i16vec12* %ap, align 16
+  %b = load %i16vec12* %bp, align 16
+  %x = add %i16vec12 %a, %b
+  store %i16vec12 %x, %i16vec12* %ret, align 16
+  ret void
+}
+
+%i16vec18 = type <18 x i16>
+define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: paddw
+; CHECK: movd
+; CHECK: movaps
+; CHECK: movaps
+  %a = load %i16vec18* %ap, align 16
+  %b = load %i16vec18* %bp, align 16
+  %x = add %i16vec18 %a, %b
+  store %i16vec18 %x, %i16vec18* %ret, align 16
+  ret void
+}
+
+
+%i8vec3 = type <3 x i8>
+define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
+; CHECK: movaps
+; CHECK: paddb
+; CHECK: pextrb
+; CHECK: movb
+  %a = load %i8vec3* %ap, align 16
+  %b = load %i8vec3* %bp, align 16
+  %x = add %i8vec3 %a, %b
+  store %i8vec3 %x, %i8vec3* %ret, align 16
+  ret void
+}
+
+%i8vec31 = type <31 x i8>
+define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: paddb
+; CHECK: paddb
+; CHECK: movq
+; CHECK: pextrb
+; CHECK: pextrw
+  %a = load %i8vec31* %ap, align 16
+  %b = load %i8vec31* %bp, align 16
+  %x = add %i8vec31 %a, %b
+  store %i8vec31 %x, %i8vec31* %ret, align 16
+  ret void
+}
\ No newline at end of file
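
As a worked illustration of the widening this patch enables: FindMemType picks the widest legal type that either fits in the bits still left to load, or, when the access is aligned (Align != 0), may overshoot them by up to WidenEx bits, since an aligned access that stays within the alignment cannot cross into an unmapped page. The standalone C++ sketch below mirrors only that width arithmetic for the add7i32 case in widen_load-2.ll; it is not LLVM code, and the candidate table and simplified legality test are assumptions for illustration (the real FindMemType also checks target type legality and that each width evenly divides the widen width).

// Standalone sketch (assumed SSE2-like candidate types); mirrors the
// Align/WidenEx arithmetic of FindMemType above, nothing more.
#include <cstdio>

struct Cand { const char *Name; unsigned Bits; };

// Assumed candidates, widest first: one legal vector type plus the
// legal integer types.
static const Cand Cands[] = {
  {"v4i32", 128}, {"i64", 64}, {"i32", 32}, {"i16", 16}, {"i8", 8}
};

// Pick the widest candidate that fits in Width, or that the alignment
// allowance admits: Align != 0, at most WidenEx extra bits, and never
// wider than the alignment itself.
static Cand Pick(unsigned Width, unsigned Align, unsigned WidenEx) {
  unsigned AlignInBits = Align * 8;
  for (const Cand &C : Cands)
    if (C.Bits <= Width ||
        (Align != 0 && C.Bits <= AlignInBits && C.Bits <= Width + WidenEx))
      return C;
  return Cands[sizeof(Cands) / sizeof(Cands[0]) - 1]; // smallest candidate
}

int main() {
  // add7i32: <7 x i32> (224 bits) widened to <8 x i32> (256 bits),
  // loaded from a 16-byte-aligned pointer.
  unsigned Width = 224;
  const unsigned Align = 16, WidenEx = 256 - 224;
  while (Width > 0) {
    Cand C = Pick(Width, Align, WidenEx);
    unsigned Left = Width > C.Bits ? Width - C.Bits : 0;
    std::printf("load %-5s (%3u bits), %3u bits left\n", C.Name, C.Bits, Left);
    Width = Left;
  }
  return 0;
}

With the alignment allowance, the 96-bit tail is covered by a second aligned v4i32 load (two movaps, as the add7i32 CHECK lines expect); with Align passed as 0 the same tail would instead decompose into an i64 load followed by an i32 load.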