From 0328ca6cd7f194c4ee7e4a8203f56ae7a17ed014 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Wed, 18 Mar 2015 12:01:59 +0000 Subject: [PATCH] [ARM] Align stack objects passed to memory intrinsics Memcpy, and other memory intrinsics, typically tries to use LDM/STM if the source and target addresses are 4-byte aligned. In CodeGenPrepare look for calls to memory intrinsics and, if the object is on the stack, 4-byte align it if it's large enough that we expect that memcpy would want to use LDM/STM to copy it. Differential Revision: http://reviews.llvm.org/D7908 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232627 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 9 + lib/CodeGen/CodeGenPrepare.cpp | 36 ++++ lib/Target/ARM/ARMISelLowering.cpp | 15 ++ lib/Target/ARM/ARMISelLowering.h | 3 + test/CodeGen/ARM/memfunc.ll | 293 +++++++++++++++++++++++++-- test/CodeGen/ARM/memset-inline.ll | 2 +- 6 files changed, 337 insertions(+), 21 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 5cf853cca2a..390cc9250b0 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -976,6 +976,15 @@ public: return false; } + /// Return true if the pointer arguments to CI should be aligned by aligning + /// the object whose address is being passed. If so then MinSize is set to the + /// minimum size the object must be to be aligned and PrefAlign is set to the + /// preferred alignment. + virtual bool shouldAlignPointerArgs(CallInst */*CI*/, unsigned &/*MinSize*/, + unsigned &/*PrefAlign*/) const { + return false; + } + //===--------------------------------------------------------------------===// /// \name Helpers for TargetTransformInfo implementations /// @{ diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 61485fc67df..314c73437e0 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1228,6 +1228,42 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return true; } + const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr; + + // Align the pointer arguments to this call if the target thinks it's a good + // idea + unsigned MinSize, PrefAlign; + if (TLI && TD && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + for (auto &Arg : CI->arg_operands()) { + // We want to align both objects whose address is used directly and + // objects whose address is used in casts and GEPs, though it only makes + // sense for GEPs if the offset is a multiple of the desired alignment and + // if size - offset meets the size threshold. 
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ APInt Offset(TD->getPointerSizeInBits(
+ cast<PointerType>(Arg->getType())->getAddressSpace()), 0);
+ Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*TD, Offset);
+ uint64_t Offset2 = Offset.getLimitedValue();
+ AllocaInst *AI;
+ if ((Offset2 & (PrefAlign-1)) == 0 &&
+ (AI = dyn_cast<AllocaInst>(Val)) &&
+ AI->getAlignment() < PrefAlign &&
+ TD->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+ AI->setAlignment(PrefAlign);
+ // TODO: Also align GlobalVariables
+ }
+ // If this is a memcpy (or similar) then we may be able to improve the
+ // alignment
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+ unsigned Align = getKnownAlignment(MI->getDest(), *TD);
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+ Align = std::min(Align, getKnownAlignment(MTI->getSource(), *TD));
+ if (Align > MI->getAlignment())
+ MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+ }
+ }
+
 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
 if (II) {
 switch (II->getIntrinsicID()) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 74e8512851e..fb12cc226af 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/CommandLine.h"
@@ -1163,6 +1164,20 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
 return TargetLowering::getRegClassFor(VT);
 }

+// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const {
+ if (!isa<MemIntrinsic>(CI))
+ return false;
+ MinSize = 8;
+ // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+ // cycle faster than 4-byte aligned LDM.
+ PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+ return true;
+}
+
 // Create a fast isel object.
 FastISel *
 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index a364933b38a..682f479278e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -368,6 +368,9 @@ namespace llvm {
 return true;
 }

+ bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const override;
+
 /// createFastISel - This method returns a target specific FastISel object,
 /// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo, diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll index 8d3800b43c1..160096a3808 100644 --- a/test/CodeGen/ARM/memfunc.ll +++ b/test/CodeGen/ARM/memfunc.ll @@ -1,31 +1,284 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -o - | FileCheck %s -; RUN: llc < %s -mtriple=thumbv7m-none-macho -o - | FileCheck %s --check-prefix=DARWIN -; RUN: llc < %s -mtriple=arm-none-eabi -o - | FileCheck --check-prefix=EABI %s -; RUN: llc < %s -mtriple=arm-none-eabihf -o - | FileCheck --check-prefix=EABI %s +; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS --check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK @from = common global [500 x i32] zeroinitializer, align 4 @to = common global [500 x i32] zeroinitializer, align 4 -define void @f() { +define void @f1() { entry: + ; CHECK-LABEL: f1 - ; CHECK: memmove - ; EABI: __aeabi_memmove - call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) - ; CHECK: memcpy - ; EABI: __aeabi_memcpy - call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) - ; EABI memset swaps arguments - ; CHECK: mov r1, #0 - ; CHECK: memset - ; DARWIN: movs r1, #0 - ; DARWIN: memset - ; EABI: mov r2, #0 - ; EABI: __aeabi_memset - call void @llvm.memset.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0, i1 false) - unreachable + ; EABI memset swaps arguments + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + call void @llvm.memset.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0, i1 false) + unreachable +} + +; Check that alloca arguments to memory intrinsics are automatically aligned if at least 8 bytes in size +define void @f2(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f2 + + ; IOS (ARMv7) should 8-byte align, others should 4-byte align + ; CHECK-IOS: add r1, sp, #32 + ; CHECK-IOS: memmove + ; CHECK-DARWIN: add r1, sp, #28 + ; CHECK-DARWIN: memmove + ; CHECK-EABI: add r1, sp, #28 + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [9 x i8], align 1 + %0 = bitcast [9 x i8]* %arr0 to i8* + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: add r1, sp, #16 + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [9 x i8], align 1 + %1 = bitcast [9 x i8]* %arr1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK-IOS: mov r0, sp + ; 
CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: add r0, sp, #4
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: add r0, sp, #4
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [9 x i8], align 1
+ %2 = bitcast [9 x i8]* %arr2 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if less than 8 bytes in size
+define void @f3(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f3
+
+ ; CHECK: {{add(.w)? r1, sp, #17|sub(.w)? r1, r7, #15}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [7 x i8], align 1
+ %0 = bitcast [7 x i8]* %arr0 to i8*
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r1, sp, #10}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [7 x i8], align 1
+ %1 = bitcast [7 x i8]* %arr1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r0, sp, #3}}
+ ; CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [7 x i8], align 1
+ %2 = bitcast [7 x i8]* %arr2 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if size - offset is less than 8 bytes
+define void @f4(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f4
+
+ ; CHECK: {{add(.w)? r., sp, #23|sub(.w)? r., r7, #17}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [9 x i8], align 1
+ %0 = getelementptr inbounds [9 x i8], [9 x i8]* %arr0, i32 0, i32 4
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(10|14)}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [9 x i8], align 1
+ %1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(1|5)}}
+ ; CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [9 x i8], align 1
+ %2 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 4
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if the offset is not a multiple of 4
+define void @f5(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f5
+
+ ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [13 x i8], align 1
+ %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 1
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(10|14)}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [13 x i8], align 1
+ %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 1
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? 
r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 1 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned if the offset is unknown +define void @f6(i8* %dest, i32 %n, i32 %i) { +entry: + ; CHECK-LABEL: f6 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #25}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 %i + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 %i + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 %i + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned if the GEP is not inbounds +define void @f7(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f7 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr [13 x i8], [13 x i8]* %arr0, i32 0, i32 4 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr [13 x i8], [13 x i8]* %arr1, i32 0, i32 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr [13 x i8], [13 x i8]* %arr2, i32 0, i32 4 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned when the offset is past the end of the allocation +define void @f8(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f8 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 16 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? 
r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 16 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 16 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable } declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index fa4f850b95a..191db1e20a2 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -17,7 +17,7 @@ entry: ; CHECK: add.w r1, r0, #10 ; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 ; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
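
For reference, a minimal sketch of the IR rewrite the CodeGenPrepare change performs, assuming an ARMv7 A/R-class target (so shouldAlignPointerArgs returns MinSize = 8 and PrefAlign = 8); the function and value names below are illustrative, not part of the patch:

  ; Before CodeGenPrepare: an under-aligned stack buffer passed to memcpy
  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1)

  define void @copy_out(i8* %dest, i32 %n) {
  entry:
    %buf = alloca [16 x i8], align 1
    %p = bitcast [16 x i8]* %buf to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %p, i32 %n, i32 0, i1 false)
    ret void
  }

  ; After OptimizeCallInst: the alloca meets the size threshold (16 >= MinSize + offset 0)
  ; and its alignment is below PrefAlign, so it is realigned in place, letting the ARM
  ; backend favour LDM/STM when the copy is expanded inline:
  ;   %buf = alloca [16 x i8], align 8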