When promoting integer vectors we often create ext-loads. This patch adds a
dag-combine optimization to implement the ext-load efficiently (using shuffles).

For example, the type <4 x i8> is stored in memory as a single i32, but it needs
to find its way into a <4 x i32> register. Previously we scalarized the memory
access; now we use shuffles.
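
As a minimal illustration (a sketch in the style of the new test below; the
function name is hypothetical, not part of the commit), element promotion widens
the <4 x i8> arithmetic to <4 x i32>, so the load becomes a vector ext-load that
this combine can now lower with one scalar load plus a shuffle:

; Hypothetical example. With -promote-elements, the <4 x i8> add is
; promoted to <4 x i32>, so %v is produced by an ext-load: one i32 load
; followed by a shuffle instead of four scalar byte loads.
define <4 x i8> @promote_example(<4 x i8>* %p) {
  %v = load <4 x i8>* %p        ; <4 x i8> occupies a single i32 in memory
  %w = add <4 x i8> %v, <i8 1, i8 2, i8 3, i8 4>
  ret <4 x i8> %w
}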




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@139995 91177308-0d34-0410-b5e6-96231b3b80d8
Nadav Rotem 2011-09-18 10:39:32 +00:00
parent b6266fb602
commit 91e43fd17a
2 changed files with 168 additions and 3 deletions


@@ -1138,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -13433,6 +13434,89 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  DebugLoc dl = Ld->getDebugLoc();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  // If this is a vector EXT load then attempt to optimize it using a
  // shuffle. We need SSE4 for the shuffles.
  // TODO: It is possible to support ZExt by zeroing the undef values
  // during the shuffle phase or after the shuffle.
  if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
    assert(MemVT != RegVT && "Cannot extend to the same type");
    assert(MemVT.isVector() && "Must load a vector from memory");

    unsigned NumElems = RegVT.getVectorNumElements();
    unsigned RegSz = RegVT.getSizeInBits();
    unsigned MemSz = MemVT.getSizeInBits();
    assert(RegSz > MemSz && "Register size must be greater than the mem size");

    // All sizes must be a power of two.
    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();

    // Attempt to load the original value using a single load op.
    // Find a scalar type which is equal to the loaded word size.
    MVT SclrLoadTy = MVT::i8;
    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
      MVT Tp = (MVT::SimpleValueType)tp;
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
        SclrLoadTy = Tp;
        break;
      }
    }

    // Proceed if a load word is found.
    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();

    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
                                         RegSz/SclrLoadTy.getSizeInBits());
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                     RegSz/MemVT.getScalarType().getSizeInBits());

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    // Perform a single load.
    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
                                     Ld->getBasePtr(),
                                     Ld->getPointerInfo(), Ld->isVolatile(),
                                     Ld->isNonTemporal(), Ld->getAlignment());

    // Insert the word loaded into a vector.
    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                         LoadUnitVecVT, ScalarLoad);

    // Bitcast the loaded value to a vector of the original element type, in
    // the size of the target vector type.
    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);

    unsigned SizeRatio = RegSz/MemSz;

    // Redistribute the loaded elements into the different locations.
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                         DAG.getUNDEF(SlicedVec.getValueType()),
                                         ShuffleVec.data());

    // Bitcast to the requested type.
    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);

    // Replace the original load with the new sequence
    // and return the new chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
    return SDValue(ScalarLoad.getNode(), 1);
  }

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
@@ -13479,9 +13563,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  // From, To sizes and ElemCount must be powers of two.
  if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the bigger size.
  if (0 != (NumElems * ToSz) % FromSz) return SDValue();
  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

@@ -13885,6 +13967,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
  case X86ISD::FXOR:
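
As a worked example of the new PerformLOADCombine above, take RegVT = <4 x i32>
and MemVT = <4 x i8>: then RegSz = 128 and MemSz = 32, the loop picks
SclrLoadTy = i32, WideVecVT = <16 x i8>, and SizeRatio = 4, so the mask places
loaded element i at position i*4. Expressed as an IR-style sketch (the combine
actually builds the equivalent SelectionDAG nodes, not IR):

; Sketch only: DAG nodes shown in IR notation for readability.
%word   = load i32* %p                                    ; one 32-bit load
%unit   = insertelement <4 x i32> undef, i32 %word, i32 0 ; SCALAR_TO_VECTOR
%bytes  = bitcast <4 x i32> %unit to <16 x i8>            ; WideVecVT
%spread = shufflevector <16 x i8> %bytes, <16 x i8> undef,
            <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                        i32 1, i32 undef, i32 undef, i32 undef,
                        i32 2, i32 undef, i32 undef, i32 undef,
                        i32 3, i32 undef, i32 undef, i32 undef>
%result = bitcast <16 x i8> %spread to <4 x i32>          ; RegVT

An any-extending load leaves the upper lanes undef, which is why the TODO notes
that supporting ZExt would additionally require zeroing those lanes.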


@@ -0,0 +1,82 @@
; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s

;CHECK: load_2_i8
; A single 16-bit load
;CHECK: movzwl
;CHECK: pshufb
;CHECK: paddq
;CHECK: pshufb
; A single 16-bit store
;CHECK: movw
;CHECK: ret
define void @load_2_i8(<2 x i8>* %A) {
%T = load <2 x i8>* %A
%G = add <2 x i8> %T, <i8 9, i8 7>
store <2 x i8> %G, <2 x i8>* %A
ret void
}

;CHECK: load_2_i16
; Read 32-bits
;CHECK: movd
;CHECK: pshufb
;CHECK: paddq
;CHECK: pshufb
;CHECK: movd
;CHECK: ret
define void @load_2_i16(<2 x i16>* %A) {
%T = load <2 x i16>* %A
%G = add <2 x i16> %T, <i16 9, i16 7>
store <2 x i16> %G, <2 x i16>* %A
ret void
}

;CHECK: load_2_i32
;CHECK: pshufd
;CHECK: paddq
;CHECK: pshufd
;CHECK: ret
define void @load_2_i32(<2 x i32>* %A) {
%T = load <2 x i32>* %A
%G = add <2 x i32> %T, <i32 9, i32 7>
store <2 x i32> %G, <2 x i32>* %A
ret void
}

;CHECK: load_4_i8
;CHECK: movd
;CHECK: pshufb
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
define void @load_4_i8(<4 x i8>* %A) {
%T = load <4 x i8>* %A
%G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
store <4 x i8> %G, <4 x i8>* %A
ret void
}

;CHECK: load_4_i16
;CHECK: punpcklwd
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
define void @load_4_i16(<4 x i16>* %A) {
%T = load <4 x i16>* %A
%G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
store <4 x i16> %G, <4 x i16>* %A
ret void
}

;CHECK: load_8_i8
;CHECK: punpcklbw
;CHECK: paddw
;CHECK: pshufb
;CHECK: ret
define void @load_8_i8(<8 x i8>* %A) {
%T = load <8 x i8>* %A
%G = add <8 x i8> %T, %T
store <8 x i8> %G, <8 x i8>* %A
ret void
}