diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 0b728a94fa0..67ba14b6a38 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -185,6 +185,9 @@ class ARMFastISel : public FastISel {
     bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr);
     bool ARMComputeAddress(const Value *Obj, Address &Addr);
     void ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3);
+    bool ARMIsMemXferSmall(uint64_t Len);
+    bool ARMTryEmitSmallMemXfer(Address Dest, Address Src, uint64_t Len,
+                                bool isMemCpy);
     unsigned ARMEmitIntExt(EVT SrcVT, unsigned SrcReg, EVT DestVT, bool isZExt);
     unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT);
     unsigned ARMMaterializeInt(const Constant *C, EVT VT);
@@ -2193,18 +2196,76 @@ bool ARMFastISel::SelectCall(const Instruction *I,
   return true;
 }
 
+bool ARMFastISel::ARMIsMemXferSmall(uint64_t Len) {
+  return Len <= 16;
+}
+
+bool ARMFastISel::ARMTryEmitSmallMemXfer(Address Dest, Address Src, uint64_t Len,
+                                         bool isMemCpy) {
+  // FIXME: Memmoves require a little more care because their source and
+  // destination may overlap.
+  if (!isMemCpy)
+    return false;
+
+  // Make sure we don't bloat code by inlining very large memcpys.
+  if (!ARMIsMemXferSmall(Len))
+    return false;
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else {
+      assert(Len == 1);
+      VT = MVT::i8;
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = ARMEmitLoad(VT, ResultReg, Src);
+    assert (RV == true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, ResultReg, Dest);
+    assert (RV == true && "Should be able to handle this store.");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    Dest.Offset += Size;
+    Src.Offset += Size;
+  }
+
+  return true;
+}
+
 bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
   // FIXME: Handle more intrinsics.
   switch (I.getIntrinsicID()) {
   default: return false;
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
-    // FIXME: Small memcpy/memmove's are common enough that we want to do them
-    // without a call if possible.
     const MemTransferInst &MTI = cast<MemTransferInst>(I);
     // Don't handle volatile.
     if (MTI.isVolatile())
       return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy/memmoves are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      if (ARMIsMemXferSmall(Len)) {
+        Address Dest, Src;
+        if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+            !ARMComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        if (ARMTryEmitSmallMemXfer(Dest, Src, Len, isMemCpy))
+          return true;
+      }
+    }
 
     if (!MTI.getLength()->getType()->isIntegerTy(32))
       return false;
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 9a924b491b6..3ef8bce5fd0 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -33,7 +33,7 @@ define void @t2() nounwind ssp {
 ; ARM: ldr r0, [r0]
 ; ARM: add r1, r0, #4
 ; ARM: add r0, r0, #16
-; ARM: movw r2, #10
+; ARM: movw r2, #17
 ; ARM: str r0, [sp] @ 4-byte Spill
 ; ARM: mov r0, r1
 ; ARM: ldr r1, [sp] @ 4-byte Reload
@@ -43,11 +43,11 @@ define void @t2() nounwind ssp {
 ; THUMB: ldr r0, [r0]
 ; THUMB: adds r1, r0, #4
 ; THUMB: adds r0, #16
-; THUMB: movs r2, #10
+; THUMB: movs r2, #17
 ; THUMB: movt r2, #0
 ; THUMB: mov r0, r1
 ; THUMB: bl _memcpy
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 1, i1 false)
   ret void
 }
 
@@ -75,4 +75,32 @@ define void @t3() nounwind ssp {
   ret void
 }
 
+define void @t4() nounwind ssp {
+; ARM: t4
+; ARM: ldr r0, LCPI3_0
+; ARM: ldr r0, [r0]
+; ARM: ldr r1, LCPI3_1
+; ARM: ldr r1, [r1]
+; ARM: ldr r2, [r1, #16]
+; ARM: str r2, [r0, #4]
+; ARM: ldr r2, [r1, #20]
+; ARM: str r2, [r0, #8]
+; ARM: ldrh r1, [r1, #24]
+; ARM: strh r1, [r0, #12]
+; ARM: bx lr
+; THUMB: ldr.n r0, LCPI3_0
+; THUMB: ldr r0, [r0]
+; THUMB: ldr.n r1, LCPI3_1
+; THUMB: ldr r1, [r1]
+; THUMB: ldr r2, [r1, #16]
+; THUMB: str r2, [r0, #4]
+; THUMB: ldr r2, [r1, #20]
+; THUMB: str r2, [r0, #8]
+; THUMB: ldrh r1, [r1, #24]
+; THUMB: strh r1, [r0, #12]
+; THUMB: bx lr
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  ret void
+}
+
 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
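For reference, the loop in ARMTryEmitSmallMemXfer above splits a constant-length copy of at most 16 bytes into the widest integer accesses that still fit, which is exactly what the new @t4 CHECK lines encode: a 10-byte copy becomes two ldr/str pairs (4 + 4 bytes) followed by one ldrh/strh pair (2 bytes). It is also why @t2 now copies 17 bytes, one past the ARMIsMemXferSmall threshold, so it keeps exercising the bl _memcpy libcall path. The sketch below is a minimal standalone illustration of that width selection, not code from the patch; the helper name smallMemXferWidths is made up for this example.

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the width-selection loop in ARMTryEmitSmallMemXfer: pick the widest
// of 4-, 2-, or 1-byte accesses that still fits, until Len is consumed.
// Returns an empty vector when the copy is too large to inline (Len > 16,
// matching ARMIsMemXferSmall in the patch).
std::vector<unsigned> smallMemXferWidths(uint64_t Len) {
  std::vector<unsigned> Widths;
  if (Len > 16)
    return Widths;  // would fall back to a real memcpy call
  while (Len) {
    unsigned Size;
    if (Len >= 4)
      Size = 4;      // ldr/str
    else if (Len >= 2)
      Size = 2;      // ldrh/strh
    else
      Size = 1;      // ldrb/strb
    Widths.push_back(Size);
    Len -= Size;
  }
  return Widths;
}

int main() {
  // Len == 10, the length used by @t4: prints "4 4 2", i.e. two ldr/str
  // pairs followed by one ldrh/strh pair, as the CHECK lines expect.
  for (unsigned W : smallMemXferWidths(10))
    std::cout << W << ' ';
  std::cout << '\n';
  return 0;
}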