mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-08 03:30:22 +00:00
Fix memcpy lowering when addresses are 4-byte aligned but size is not a multiple of 4.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@43234 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
ac72058dd6
commit
4102eb57bb
@ -1287,7 +1287,8 @@ static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG,
|
|||||||
return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
|
return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
|
||||||
}
|
}
|
||||||
|
|
||||||
SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
|
SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG,
|
||||||
|
const ARMSubtarget *ST) {
|
||||||
SDOperand ChainOp = Op.getOperand(0);
|
SDOperand ChainOp = Op.getOperand(0);
|
||||||
SDOperand DestOp = Op.getOperand(1);
|
SDOperand DestOp = Op.getOperand(1);
|
||||||
SDOperand SourceOp = Op.getOperand(2);
|
SDOperand SourceOp = Op.getOperand(2);
|
||||||
@ -1305,25 +1306,18 @@ SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
|
|||||||
assert(!AlwaysInline && "Cannot inline copy of unknown size");
|
assert(!AlwaysInline && "Cannot inline copy of unknown size");
|
||||||
return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
|
return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
|
||||||
}
|
}
|
||||||
unsigned Size = I->getValue();
|
|
||||||
|
|
||||||
if (AlwaysInline)
|
// If not DWORD aligned or if size is more than threshold, then call memcpy.
|
||||||
return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
|
// The libc version is likely to be faster for these cases. It can
|
||||||
|
|
||||||
// The libc version is likely to be faster for the following cases. It can
|
|
||||||
// use the address value and run time information about the CPU.
|
// use the address value and run time information about the CPU.
|
||||||
// With glibc 2.6.1 on a core 2, copying an array of 100M longs was 30% faster
|
// With glibc 2.6.1 on a core 2, copying an array of 100M longs was 30% faster
|
||||||
|
// FIXME: For now, we don't lower memcpy's to loads / stores for Thumb. Change
|
||||||
// If not DWORD aligned, call memcpy.
|
// this once Thumb ldmia / stmia support is added.
|
||||||
if ((Align & 3) != 0)
|
unsigned Size = I->getValue();
|
||||||
return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
|
if (AlwaysInline ||
|
||||||
|
(!ST->isThumb() && Size < 64 && (Align & 3) == 0))
|
||||||
// If size is more than the threshold, call memcpy.
|
return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
|
||||||
// if (Size > Subtarget->getMinRepStrSizeThreshold())
|
return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
|
||||||
if (Size >= 64)
|
|
||||||
return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
|
|
||||||
|
|
||||||
return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SDOperand ARMTargetLowering::LowerMEMCPYCall(SDOperand Chain,
|
SDOperand ARMTargetLowering::LowerMEMCPYCall(SDOperand Chain,
|
||||||
@ -1350,46 +1344,93 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
|
|||||||
unsigned Size,
|
unsigned Size,
|
||||||
unsigned Align,
|
unsigned Align,
|
||||||
SelectionDAG &DAG) {
|
SelectionDAG &DAG) {
|
||||||
|
// Do repeated 4-byte loads and stores. To be improved.
|
||||||
// Do repeated 4-byte loads and stores. To be improved.
|
assert((Align & 3) == 0 && "Expected 4-byte aligned addresses!");
|
||||||
assert((Size& 3) == 0);
|
unsigned BytesLeft = Size & 3;
|
||||||
assert((Align & 3) == 0);
|
|
||||||
unsigned NumMemOps = Size >> 2;
|
unsigned NumMemOps = Size >> 2;
|
||||||
unsigned EmittedNumMemOps = 0;
|
unsigned EmittedNumMemOps = 0;
|
||||||
unsigned SrcOff = 0, DstOff = 0;
|
unsigned SrcOff = 0, DstOff = 0;
|
||||||
MVT::ValueType VT = MVT::i32;
|
MVT::ValueType VT = MVT::i32;
|
||||||
unsigned VTSize = 4;
|
unsigned VTSize = 4;
|
||||||
|
unsigned i = 0;
|
||||||
const unsigned MAX_LOADS_IN_LDM = 6;
|
const unsigned MAX_LOADS_IN_LDM = 6;
|
||||||
SDOperand LoadChains[MAX_LOADS_IN_LDM];
|
SDOperand TFOps[MAX_LOADS_IN_LDM];
|
||||||
SDOperand Loads[MAX_LOADS_IN_LDM];
|
SDOperand Loads[MAX_LOADS_IN_LDM];
|
||||||
|
|
||||||
// Emit up to 4 loads, then a TokenFactor barrier, then the same
|
// Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
|
||||||
// number of stores. The loads and stores will get combined into
|
// same number of stores. The loads and stores will get combined into
|
||||||
// ldm/stm later on.
|
// ldm/stm later on.
|
||||||
while(EmittedNumMemOps < NumMemOps) {
|
while (EmittedNumMemOps < NumMemOps) {
|
||||||
unsigned i;
|
for (i = 0;
|
||||||
for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
|
i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
|
||||||
Loads[i] = DAG.getLoad(VT, Chain,
|
Loads[i] = DAG.getLoad(VT, Chain,
|
||||||
DAG.getNode(ISD::ADD, VT, Source,
|
DAG.getNode(ISD::ADD, MVT::i32, Source,
|
||||||
DAG.getConstant(SrcOff, VT)),
|
DAG.getConstant(SrcOff, MVT::i32)),
|
||||||
NULL, 0);
|
NULL, 0);
|
||||||
LoadChains[i] = Loads[i].getValue(1);
|
TFOps[i] = Loads[i].getValue(1);
|
||||||
SrcOff += VTSize;
|
SrcOff += VTSize;
|
||||||
}
|
}
|
||||||
|
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
|
||||||
|
|
||||||
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i);
|
for (i = 0;
|
||||||
|
i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
|
||||||
for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
|
TFOps[i] = DAG.getStore(Chain, Loads[i],
|
||||||
Chain = DAG.getStore(Chain, Loads[i],
|
DAG.getNode(ISD::ADD, MVT::i32, Dest,
|
||||||
DAG.getNode(ISD::ADD, VT, Dest,
|
DAG.getConstant(DstOff, MVT::i32)),
|
||||||
DAG.getConstant(DstOff, VT)),
|
|
||||||
NULL, 0);
|
NULL, 0);
|
||||||
DstOff += VTSize;
|
DstOff += VTSize;
|
||||||
}
|
}
|
||||||
|
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
|
||||||
|
|
||||||
EmittedNumMemOps += i;
|
EmittedNumMemOps += i;
|
||||||
}
|
}
|
||||||
|
|
||||||
return Chain;
|
if (BytesLeft == 0)
|
||||||
|
return Chain;
|
||||||
|
|
||||||
|
// Issue loads / stores for the trailing (1 - 3) bytes.
|
||||||
|
unsigned BytesLeftSave = BytesLeft;
|
||||||
|
i = 0;
|
||||||
|
while (BytesLeft) {
|
||||||
|
if (BytesLeft >= 2) {
|
||||||
|
VT = MVT::i16;
|
||||||
|
VTSize = 2;
|
||||||
|
} else {
|
||||||
|
VT = MVT::i8;
|
||||||
|
VTSize = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Loads[i] = DAG.getLoad(VT, Chain,
|
||||||
|
DAG.getNode(ISD::ADD, MVT::i32, Source,
|
||||||
|
DAG.getConstant(SrcOff, MVT::i32)),
|
||||||
|
NULL, 0);
|
||||||
|
TFOps[i] = Loads[i].getValue(1);
|
||||||
|
++i;
|
||||||
|
SrcOff += VTSize;
|
||||||
|
BytesLeft -= VTSize;
|
||||||
|
}
|
||||||
|
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
BytesLeft = BytesLeftSave;
|
||||||
|
while (BytesLeft) {
|
||||||
|
if (BytesLeft >= 2) {
|
||||||
|
VT = MVT::i16;
|
||||||
|
VTSize = 2;
|
||||||
|
} else {
|
||||||
|
VT = MVT::i8;
|
||||||
|
VTSize = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
TFOps[i] = DAG.getStore(Chain, Loads[i],
|
||||||
|
DAG.getNode(ISD::ADD, MVT::i32, Dest,
|
||||||
|
DAG.getConstant(DstOff, MVT::i32)),
|
||||||
|
NULL, 0);
|
||||||
|
++i;
|
||||||
|
DstOff += VTSize;
|
||||||
|
BytesLeft -= VTSize;
|
||||||
|
}
|
||||||
|
return DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
|
||||||
}
|
}
|
||||||
|
|
||||||
SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
|
SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
|
||||||
@ -1419,7 +1460,7 @@ SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
|
|||||||
case ISD::RETURNADDR: break;
|
case ISD::RETURNADDR: break;
|
||||||
case ISD::FRAMEADDR: break;
|
case ISD::FRAMEADDR: break;
|
||||||
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
|
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
|
||||||
case ISD::MEMCPY: return LowerMEMCPY(Op, DAG);
|
case ISD::MEMCPY: return LowerMEMCPY(Op, DAG, Subtarget);
|
||||||
}
|
}
|
||||||
return SDOperand();
|
return SDOperand();
|
||||||
}
|
}
|
||||||
|
@ -130,11 +130,12 @@ namespace llvm {
|
|||||||
SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
|
SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
|
||||||
SelectionDAG &DAG);
|
SelectionDAG &DAG);
|
||||||
SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA,
|
SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA,
|
||||||
SelectionDAG &DAG);
|
SelectionDAG &DAG);
|
||||||
SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
|
SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
|
||||||
SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
|
SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
|
||||||
SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
|
SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
|
||||||
SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
|
SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG,
|
||||||
|
const ARMSubtarget *ST);
|
||||||
SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest,
|
SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest,
|
||||||
SDOperand Source, SDOperand Count,
|
SDOperand Source, SDOperand Count,
|
||||||
SelectionDAG &DAG);
|
SelectionDAG &DAG);
|
||||||
|
@ -221,3 +221,7 @@ LPC0:
|
|||||||
|
|
||||||
Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
|
Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
|
||||||
etc. Almost all Thumb instructions clobber condition code.
|
etc. Almost all Thumb instructions clobber condition code.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Add ldmia, stmia support.
|
||||||
|
15
test/CodeGen/ARM/memcpy-inline.ll
Normal file
15
test/CodeGen/ARM/memcpy-inline.ll
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldmia
|
||||||
|
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrb
|
||||||
|
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrh
|
||||||
|
|
||||||
|
%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
|
||||||
|
@src = external global %struct.x
|
||||||
|
@dst = external global %struct.x
|
||||||
|
|
||||||
|
define i32 @t() {
|
||||||
|
entry:
|
||||||
|
call void @llvm.memcpy.i32( i8* getelementptr (%struct.x* @dst, i32 0, i32 0), i8* getelementptr (%struct.x* @src, i32 0, i32 0), i32 11, i32 8 )
|
||||||
|
ret i32 0
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
|
Loading…
Reference in New Issue
Block a user