mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-02 07:11:49 +00:00
When promoting integer vectors we often create ext-loads. This patch adds a
dag-combine optimization to implement the ext-load efficiently (using shuffles). For example, the type <4 x i8> is stored in memory as i32, but it needs to find its way into a <4 x i32> register. Previously we scalarized the memory access; now we use shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@139995 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
b6266fb602
commit
91e43fd17a
@ -1138,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
||||
setTargetDAGCombine(ISD::AND);
|
||||
setTargetDAGCombine(ISD::ADD);
|
||||
setTargetDAGCombine(ISD::SUB);
|
||||
setTargetDAGCombine(ISD::LOAD);
|
||||
setTargetDAGCombine(ISD::STORE);
|
||||
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
||||
setTargetDAGCombine(ISD::SINT_TO_FP);
|
||||
@ -13433,6 +13434,89 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
///
/// The only case handled here is a vector EXTLOAD when the subtarget has
/// SSE4.1: the memory value is fetched with one scalar load, placed into a
/// vector, and its elements are shuffled into the lanes of the wider result
/// type. On success the uses of the loaded value (result 0 of N) are replaced
/// with the shuffled vector and the chain result of the new scalar load is
/// returned. An empty SDValue is returned when no transformation is made.
|
||||
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
|
||||
const X86Subtarget *Subtarget) {
|
||||
LoadSDNode *Ld = cast<LoadSDNode>(N);
|
||||
EVT RegVT = Ld->getValueType(0);
|
||||
EVT MemVT = Ld->getMemoryVT();
|
||||
DebugLoc dl = Ld->getDebugLoc();
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
|
||||
ISD::LoadExtType Ext = Ld->getExtensionType();
|
||||
|
||||
// If this is a vector EXT Load then attempt to optimize it using a
|
||||
// shuffle. We need SSE4 for the shuffles.
|
||||
// TODO: It is possible to support ZExt by zeroing the undef values
|
||||
// during the shuffle phase or after the shuffle.
|
||||
if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
|
||||
assert(MemVT != RegVT && "Cannot extend to the same type");
|
||||
assert(MemVT.isVector() && "Must load a vector from memory");
|
||||
|
||||
unsigned NumElems = RegVT.getVectorNumElements();
|
||||
unsigned RegSz = RegVT.getSizeInBits();
|
||||
unsigned MemSz = MemVT.getSizeInBits();
|
||||
assert(RegSz > MemSz && "Register size must be greater than the mem size");
|
||||
// All sizes must be a power of two. Checking the product is enough: it is
// a power of two only when every one of its factors is.
|
||||
if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
|
||||
|
||||
// Attempt to load the original value using a single load op.
|
||||
// Find a scalar type which is equal to the loaded word size.
// MVT::i8 is only a starting value; the size check after the loop bails
// out unless the selected type really is MemSz bits wide.
|
||||
MVT SclrLoadTy = MVT::i8;
|
||||
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
|
||||
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
|
||||
MVT Tp = (MVT::SimpleValueType)tp;
|
||||
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
|
||||
SclrLoadTy = Tp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Proceed if a load word is found.
|
||||
if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
|
||||
|
||||
// Vector of load-word-sized elements that is RegSz bits wide in total.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
|
||||
RegSz/SclrLoadTy.getSizeInBits());
|
||||
|
||||
// Vector of the memory element type that is RegSz bits wide in total.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
|
||||
RegSz/MemVT.getScalarType().getSizeInBits());
|
||||
// Can't shuffle using an illegal type.
|
||||
if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
|
||||
|
||||
// Perform a single load.
// The chain, base pointer, pointer info, volatile/non-temporal flags and
// alignment are all taken from the original load.
|
||||
SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
|
||||
Ld->getBasePtr(),
|
||||
Ld->getPointerInfo(), Ld->isVolatile(),
|
||||
Ld->isNonTemporal(), Ld->getAlignment());
|
||||
|
||||
// Insert the word loaded into a vector.
|
||||
SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
|
||||
LoadUnitVecVT, ScalarLoad);
|
||||
|
||||
// Bitcast the loaded value to a vector of the original element type, in
|
||||
// the size of the target vector type.
|
||||
SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
|
||||
unsigned SizeRatio = RegSz/MemSz;
|
||||
|
||||
// Redistribute the loaded elements into the different locations.
// Loaded element i is placed at index i*SizeRatio of the wide vector; all
// other mask entries keep their initial value of -1.
|
||||
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
|
||||
for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
|
||||
|
||||
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
|
||||
DAG.getUNDEF(SlicedVec.getValueType()),
|
||||
ShuffleVec.data());
|
||||
|
||||
// Bitcast to the requested type.
|
||||
Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
|
||||
// Replace the original load with the new sequence
|
||||
// and return the new chain.
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
|
||||
return SDValue(ScalarLoad.getNode(), 1);
|
||||
}
|
||||
|
||||
// Not a vector EXTLOAD we can handle: leave the node unchanged.
return SDValue();
|
||||
}
|
||||
|
||||
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
|
||||
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
|
||||
const X86Subtarget *Subtarget) {
|
||||
@ -13479,9 +13563,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
// From, To sizes and ElemCount must be pow of two
|
||||
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
|
||||
// We are going to use the original vector elt for storing.
|
||||
// accumulated smaller vector elements must be a multiple of bigger size.
|
||||
if (0 != (NumElems * ToSz) % FromSz) return SDValue();
|
||||
|
||||
unsigned SizeRatio = FromSz / ToSz;
|
||||
|
||||
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
|
||||
@ -13885,6 +13967,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
|
||||
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
|
||||
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
|
||||
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
|
||||
case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
|
||||
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
|
||||
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
|
||||
case X86ISD::FXOR:
|
||||
|
82
test/CodeGen/X86/trunc-ext-ld-st.ll
Normal file
82
test/CodeGen/X86/trunc-ext-ld-st.ll
Normal file
@ -0,0 +1,82 @@
|
||||
; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
|
||||
|
||||
;CHECK: load_2_i8
|
||||
; A single 16-bit load
|
||||
;CHECK: movzwl
|
||||
;CHECK: pshufb
|
||||
;CHECK: paddq
|
||||
;CHECK: pshufb
|
||||
; A single 16-bit store
|
||||
;CHECK: movw
|
||||
;CHECK: ret
|
||||
|
||||
define void @load_2_i8(<2 x i8>* %A) {
|
||||
%T = load <2 x i8>* %A
|
||||
%G = add <2 x i8> %T, <i8 9, i8 7>
|
||||
store <2 x i8> %G, <2 x i8>* %A
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK: load_2_i16
|
||||
; Read 32-bits
|
||||
;CHECK: movd
|
||||
;CHECK: pshufb
|
||||
;CHECK: paddq
|
||||
;CHECK: pshufb
|
||||
;CHECK: movd
|
||||
;CHECK: ret
|
||||
define void @load_2_i16(<2 x i16>* %A) {
|
||||
%T = load <2 x i16>* %A
|
||||
%G = add <2 x i16> %T, <i16 9, i16 7>
|
||||
store <2 x i16> %G, <2 x i16>* %A
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK: load_2_i32
|
||||
;CHECK: pshufd
|
||||
;CHECK: paddq
|
||||
;CHECK: pshufd
|
||||
;CHECK: ret
|
||||
define void @load_2_i32(<2 x i32>* %A) {
|
||||
%T = load <2 x i32>* %A
|
||||
%G = add <2 x i32> %T, <i32 9, i32 7>
|
||||
store <2 x i32> %G, <2 x i32>* %A
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK: load_4_i8
|
||||
;CHECK: movd
|
||||
;CHECK: pshufb
|
||||
;CHECK: paddd
|
||||
;CHECK: pshufb
|
||||
;CHECK: ret
|
||||
define void @load_4_i8(<4 x i8>* %A) {
|
||||
%T = load <4 x i8>* %A
|
||||
%G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
|
||||
store <4 x i8> %G, <4 x i8>* %A
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK: load_4_i16
|
||||
;CHECK: punpcklwd
|
||||
;CHECK: paddd
|
||||
;CHECK: pshufb
|
||||
;CHECK: ret
|
||||
define void @load_4_i16(<4 x i16>* %A) {
|
||||
%T = load <4 x i16>* %A
|
||||
%G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
|
||||
store <4 x i16> %G, <4 x i16>* %A
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK: load_8_i8
|
||||
;CHECK: punpcklbw
|
||||
;CHECK: paddw
|
||||
;CHECK: pshufb
|
||||
;CHECK: ret
|
||||
define void @load_8_i8(<8 x i8>* %A) {
|
||||
%T = load <8 x i8>* %A
|
||||
%G = add <8 x i8> %T, %T
|
||||
store <8 x i8> %G, <8 x i8>* %A
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user