mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-12 17:32:19 +00:00
When performing a truncating store, it's possible to rearrange the data
in-register, such that we can use a single vector store rather then a series of scalar stores. For func_4_8 the generated code vldr d16, LCPI0_0 vmov d17, r0, r1 vadd.i16 d16, d17, d16 vmov.u16 r0, d16[3] strb r0, [r2, #3] vmov.u16 r0, d16[2] strb r0, [r2, #2] vmov.u16 r0, d16[1] strb r0, [r2, #1] vmov.u16 r0, d16[0] strb r0, [r2] bx lr becomes vldr d16, LCPI0_0 vmov d17, r0, r1 vadd.i16 d16, d17, d16 vuzp.8 d16, d17 vst1.32 {d16[0]}, [r2, :32] bx lr I'm not fond of how this combine pessimizes 2012-03-13-DAGCombineBug.ll, but I couldn't think of a way to judiciously apply this combine. This ldrh r0, [r0, #4] strh r0, [r1] becomes vldr d16, [r0] vmov.u16 r0, d16[2] vmov.32 d16[0], r0 vuzp.16 d16, d17 vst1.32 {d16[0]}, [r1, :32] PR11158 rdar://10703339 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154340 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f31ceaf8b7
commit
7f35455708
@ -7339,8 +7339,92 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
|
||||
static SDValue PerformSTORECombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
StoreSDNode *St = cast<StoreSDNode>(N);
|
||||
if (St->isVolatile())
|
||||
return SDValue();
|
||||
|
||||
// Optimize trunc store (of multiple scalars) to shuffle and store. First,
|
||||
// pack all of the elements in one place. Next, store to memory in fewer
|
||||
// chunks.
|
||||
SDValue StVal = St->getValue();
|
||||
if (!ISD::isNormalStore(St) || St->isVolatile())
|
||||
EVT VT = StVal.getValueType();
|
||||
if (St->isTruncatingStore() && VT.isVector()) {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
EVT StVT = St->getMemoryVT();
|
||||
unsigned NumElems = VT.getVectorNumElements();
|
||||
assert(StVT != VT && "Cannot truncate to the same type");
|
||||
unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
|
||||
unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
|
||||
|
||||
// From, To sizes and ElemCount must be pow of two
|
||||
if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
|
||||
|
||||
// We are going to use the original vector elt for storing.
|
||||
// Accumulated smaller vector elements must be a multiple of the store size.
|
||||
if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
|
||||
|
||||
unsigned SizeRatio = FromEltSz / ToEltSz;
|
||||
assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
|
||||
|
||||
// Create a type on which we perform the shuffle.
|
||||
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
|
||||
NumElems*SizeRatio);
|
||||
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
|
||||
DebugLoc DL = St->getDebugLoc();
|
||||
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
|
||||
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
|
||||
for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
|
||||
|
||||
// Can't shuffle using an illegal type.
|
||||
if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
|
||||
|
||||
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
|
||||
DAG.getUNDEF(WideVec.getValueType()),
|
||||
ShuffleVec.data());
|
||||
// At this point all of the data is stored at the bottom of the
|
||||
// register. We now need to save it to mem.
|
||||
|
||||
// Find the largest store unit
|
||||
MVT StoreType = MVT::i8;
|
||||
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
|
||||
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
|
||||
MVT Tp = (MVT::SimpleValueType)tp;
|
||||
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
|
||||
StoreType = Tp;
|
||||
}
|
||||
// Didn't find a legal store type.
|
||||
if (!TLI.isTypeLegal(StoreType))
|
||||
return SDValue();
|
||||
|
||||
// Bitcast the original vector into a vector of store-size units
|
||||
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
|
||||
StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
|
||||
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
|
||||
SmallVector<SDValue, 8> Chains;
|
||||
SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
|
||||
TLI.getPointerTy());
|
||||
SDValue BasePtr = St->getBasePtr();
|
||||
|
||||
// Perform one or more big stores into memory.
|
||||
unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
|
||||
for (unsigned I = 0; I < E; I++) {
|
||||
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
|
||||
StoreType, ShuffWide,
|
||||
DAG.getIntPtrConstant(I));
|
||||
SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
|
||||
St->getPointerInfo(), St->isVolatile(),
|
||||
St->isNonTemporal(), St->getAlignment());
|
||||
BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
|
||||
Increment);
|
||||
Chains.push_back(Ch);
|
||||
}
|
||||
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
|
||||
Chains.size());
|
||||
}
|
||||
|
||||
if (!ISD::isNormalStore(St))
|
||||
return SDValue();
|
||||
|
||||
// Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
|
||||
|
@ -6,8 +6,7 @@
|
||||
; (i32 extload $addr+c*sizeof(i16)
|
||||
define void @test_hi_short3(<3 x i16> * nocapture %srcA, <2 x i16> * nocapture %dst) nounwind {
|
||||
entry:
|
||||
; CHECK: ldrh [[REG:r[0-9]+]]
|
||||
; CHECK: strh [[REG]]
|
||||
; CHECK: vst1.32
|
||||
%0 = load <3 x i16> * %srcA, align 8
|
||||
%1 = shufflevector <3 x i16> %0, <3 x i16> undef, <2 x i32> <i32 2, i32 undef>
|
||||
store <2 x i16> %1, <2 x i16> * %dst, align 4
|
||||
|
19
test/CodeGen/ARM/opt-shuff-tstore.ll
Normal file
19
test/CodeGen/ARM/opt-shuff-tstore.ll
Normal file
@ -0,0 +1,19 @@
|
||||
; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
|
||||
|
||||
; CHECK: func_4_8
|
||||
; CHECK: vst1.32
|
||||
; CHECK-NEXT: bx lr
|
||||
define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
|
||||
%r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
|
||||
store <4 x i8> %r, <4 x i8>* %p
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: func_2_16
|
||||
; CHECK: vst1.32
|
||||
; CHECK-NEXT: bx lr
|
||||
define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
|
||||
%r = add <2 x i16> %param, <i16 1, i16 2>
|
||||
store <2 x i16> %r, <2 x i16>* %p
|
||||
ret void
|
||||
}
|
@ -149,12 +149,10 @@ define void @test_with_vcombine(<4 x float>* %v) nounwind {
|
||||
}
|
||||
|
||||
; The type <2 x i16> is legalized to <2 x i32> and need to be trunc-stored
|
||||
; to <2 x i16> when stored to memory. Currently ARM scalarizes these stores.
|
||||
; See PR 11158
|
||||
; to <2 x i16> when stored to memory.
|
||||
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
|
||||
; CHECK: test_vrev64:
|
||||
; CHECK: vst1.16
|
||||
; CHECK: vst1.16
|
||||
; CHECK: vst1.32
|
||||
entry:
|
||||
%0 = bitcast <4 x i16>* %source to <8 x i16>*
|
||||
%tmp2 = load <8 x i16>* %0, align 4
|
||||
|
Loading…
x
Reference in New Issue
Block a user