diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 99d1d24d070..154832b62bd 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1287,7 +1287,8 @@ static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG, return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi); } -SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) { +SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { SDOperand ChainOp = Op.getOperand(0); SDOperand DestOp = Op.getOperand(1); SDOperand SourceOp = Op.getOperand(2); @@ -1305,25 +1306,18 @@ SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) { assert(!AlwaysInline && "Cannot inline copy of unknown size"); return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); } - unsigned Size = I->getValue(); - if (AlwaysInline) - return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); - - // The libc version is likely to be faster for the following cases. It can + // If not DWORD aligned or if size is more than threshold, then call memcpy. + // The libc version is likely to be faster for the these cases. It can // use the address value and run time information about the CPU. // With glibc 2.6.1 on a core 2, coping an array of 100M longs was 30% faster - - // If not DWORD aligned, call memcpy. - if ((Align & 3) != 0) - return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); - - // If size is more than the threshold, call memcpy. - // if (Size > Subtarget->getMinRepStrSizeThreshold()) - if (Size >= 64) - return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); - - return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); + // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb. Change + // this once Thumb ldmia / stmia support is added. + unsigned Size = I->getValue(); + if (AlwaysInline || + (!ST->isThumb() && Size < 64 && (Align & 3) == 0)) + return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); + return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); } SDOperand ARMTargetLowering::LowerMEMCPYCall(SDOperand Chain, @@ -1350,46 +1344,93 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain, unsigned Size, unsigned Align, SelectionDAG &DAG) { - - // Do repeated 4-byte loads and stores. To be improved. - assert((Size& 3) == 0); - assert((Align & 3) == 0); + // Do repeated 4-byte loads and stores. To be improved. + assert((Align & 3) == 0 && "Expected 4-byte aligned addresses!"); + unsigned BytesLeft = Size & 3; unsigned NumMemOps = Size >> 2; unsigned EmittedNumMemOps = 0; unsigned SrcOff = 0, DstOff = 0; MVT::ValueType VT = MVT::i32; unsigned VTSize = 4; + unsigned i = 0; const unsigned MAX_LOADS_IN_LDM = 6; - SDOperand LoadChains[MAX_LOADS_IN_LDM]; + SDOperand TFOps[MAX_LOADS_IN_LDM]; SDOperand Loads[MAX_LOADS_IN_LDM]; - // Emit up to 4 loads, then a TokenFactor barrier, then the same - // number of stores. The loads and stores will get combined into + // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the + // same number of stores. The loads and stores will get combined into // ldm/stm later on. - while(EmittedNumMemOps < NumMemOps) { - unsigned i; - for (i=0; i= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + Loads[i] = DAG.getLoad(VT, Chain, + DAG.getNode(ISD::ADD, MVT::i32, Source, + DAG.getConstant(SrcOff, MVT::i32)), + NULL, 0); + TFOps[i] = Loads[i].getValue(1); + ++i; + SrcOff += VTSize; + BytesLeft -= VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); + + i = 0; + BytesLeft = BytesLeftSave; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + TFOps[i] = DAG.getStore(Chain, Loads[i], + DAG.getNode(ISD::ADD, MVT::i32, Dest, + DAG.getConstant(DstOff, MVT::i32)), + NULL, 0); + ++i; + DstOff += VTSize; + BytesLeft -= VTSize; + } + return DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); } SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { @@ -1419,7 +1460,7 @@ SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { case ISD::RETURNADDR: break; case ISD::FRAMEADDR: break; case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); - case ISD::MEMCPY: return LowerMEMCPY(Op, DAG); + case ISD::MEMCPY: return LowerMEMCPY(Op, DAG, Subtarget); } return SDOperand(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 46bcb3482fe..47cb2a17718 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -130,11 +130,12 @@ namespace llvm { SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG); SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG); + SelectionDAG &DAG); SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG); SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG); SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG); - SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG, + const ARMSubtarget *ST); SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest, SDOperand Source, SDOperand Count, SelectionDAG &DAG); diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index 380097d186c..5bd16a34335 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -221,3 +221,7 @@ LPC0: Make register allocator / spiller smarter so we can re-materialize "mov r, imm", etc. Almost all Thumb instructions clobber condition code. + +//===---------------------------------------------------------------------===// + +Add ldmia, stmia support. diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll new file mode 100644 index 00000000000..665d3acd89d --- /dev/null +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -0,0 +1,15 @@ +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldmia +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrb +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrh + + %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } +@src = external global %struct.x +@dst = external global %struct.x + +define i32 @t() { +entry: + call void @llvm.memcpy.i32( i8* getelementptr (%struct.x* @dst, i32 0, i32 0), i8* getelementptr (%struct.x* @src, i32 0, i32 0), i32 11, i32 8 ) + ret i32 0 +} + +declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)