diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a8c3f80af6a..adf5098e2e6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1138,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -13433,6 +13434,89 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  DebugLoc dl = Ld->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  // If this is a vector EXT Load then attempt to optimize it using a
+  // shuffle. We need SSE4 for the shuffles.
+  // TODO: It is possible to support ZExt by zeroing the undef values
+  // during the shuffle phase or after the shuffle.
+  if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+    assert(MemVT != RegVT && "Cannot extend to the same type");
+    assert(MemVT.isVector() && "Must load a vector from memory");
+
+    unsigned NumElems = RegVT.getVectorNumElements();
+    unsigned RegSz = RegVT.getSizeInBits();
+    unsigned MemSz = MemVT.getSizeInBits();
+    assert(RegSz > MemSz && "Register size must be greater than the mem size");
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+    // Attempt to load the original value using a single load op.
+    // Find a scalar type which is equal to the loaded word size.
+    MVT SclrLoadTy = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+        SclrLoadTy = Tp;
+        break;
+      }
+    }
+
+    // Proceed if a load word is found.
+    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+                                         RegSz/SclrLoadTy.getSizeInBits());
+
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                 RegSz/MemVT.getScalarType().getSizeInBits());
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    // Perform a single load.
+    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                     Ld->getBasePtr(),
+                                     Ld->getPointerInfo(), Ld->isVolatile(),
+                                     Ld->isNonTemporal(), Ld->getAlignment());
+
+    // Insert the word loaded into a vector.
+    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                         LoadUnitVecVT, ScalarLoad);
+
+    // Bitcast the loaded value to a vector of the original element type, in
+    // the size of the target vector type.
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+    unsigned SizeRatio = RegSz/MemSz;
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                         DAG.getUNDEF(SlicedVec.getValueType()),
+                                         ShuffleVec.data());
+
+    // Bitcast to the requested type.
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    // Replace the original load with the new sequence
+    // and return the new chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+    return SDValue(ScalarLoad.getNode(), 1);
+  }
+
+  return SDValue();
+}
+
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
@@ -13479,9 +13563,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   // From, To sizes and ElemCount must be pow of two
   if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
-  // We are going to use the original vector elt for storing.
-  // accumulated smaller vector elements must be a multiple of bigger size.
-  if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+
   unsigned SizeRatio = FromSz / ToSz;
 
   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
 
@@ -13885,6 +13967,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   case X86ISD::FXOR:
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
new file mode 100644
index 00000000000..57d6e97767b
--- /dev/null
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+
+;CHECK: load_2_i8
+; A single 16-bit load
+;CHECK: movzwl
+;CHECK: pshufb
+;CHECK: paddq
+;CHECK: pshufb
+; A single 16-bit store
+;CHECK: movw
+;CHECK: ret
+
+define void @load_2_i8(<2 x i8>* %A) {
+  %T = load <2 x i8>* %A
+  %G = add <2 x i8> %T, <i8 9, i8 7>
+  store <2 x i8> %G, <2 x i8>* %A
+  ret void
+}
+
+;CHECK: load_2_i16
+; Read 32-bits
+;CHECK: movd
+;CHECK: pshufb
+;CHECK: paddq
+;CHECK: pshufb
+;CHECK: movd
+;CHECK: ret
+define void @load_2_i16(<2 x i16>* %A) {
+  %T = load <2 x i16>* %A
+  %G = add <2 x i16> %T, <i16 9, i16 7>
+  store <2 x i16> %G, <2 x i16>* %A
+  ret void
+}
+
+;CHECK: load_2_i32
+;CHECK: pshufd
+;CHECK: paddq
+;CHECK: pshufd
+;CHECK: ret
+define void @load_2_i32(<2 x i32>* %A) {
+  %T = load <2 x i32>* %A
+  %G = add <2 x i32> %T, <i32 9, i32 7>
+  store <2 x i32> %G, <2 x i32>* %A
+  ret void
+}
+
+;CHECK: load_4_i8
+;CHECK: movd
+;CHECK: pshufb
+;CHECK: paddd
+;CHECK: pshufb
+;CHECK: ret
+define void @load_4_i8(<4 x i8>* %A) {
+  %T = load <4 x i8>* %A
+  %G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
+  store <4 x i8> %G, <4 x i8>* %A
+  ret void
+}
+
+;CHECK: load_4_i16
+;CHECK: punpcklwd
+;CHECK: paddd
+;CHECK: pshufb
+;CHECK: ret
+define void @load_4_i16(<4 x i16>* %A) {
+  %T = load <4 x i16>* %A
+  %G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
+  store <4 x i16> %G, <4 x i16>* %A
+  ret void
+}
+
+;CHECK: load_8_i8
+;CHECK: punpcklbw
+;CHECK: paddw
+;CHECK: pshufb
+;CHECK: ret
+define void @load_8_i8(<8 x i8>* %A) {
+  %T = load <8 x i8>* %A
+  %G = add <8 x i8> %T, %T
+  store <8 x i8> %G, <8 x i8>* %A
+  ret void
+}