From 376642ed620ecae05b68c7bc81f79aeb2065abe0 Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Mon, 10 Dec 2012 23:21:26 +0000
Subject: [PATCH] Some enhancements for memcpy / memset inline expansion.

1. Teach it to use overlapping unaligned load / store to copy / set the
   trailing bytes, e.g. on x86, use two pairs of movups / movaps for
   17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is,
   e.g. x86 and ARM.
3. When memcpy from a constant string, do *not* replace the load with a
   constant if it's not possible to materialize an integer immediate with a
   single instruction (this required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if the target hooks indicate
   they are "fast".
5. Update ARM target hooks to use unaligned load / stores, e.g. vld1.8 /
   vst1.8. Also increase the threshold to something reasonable (8 for memset,
   4 pairs for memcpy).

This significantly improves Dhrystone, up to 50% on ARM iOS devices.

rdar://12760078

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h | 22 +++-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 82 ++++++++++---
 lib/Target/ARM/ARMISelLowering.cpp | 64 +++++++---
 lib/Target/ARM/ARMISelLowering.h | 7 +-
 lib/Target/ARM/ARMInstrThumb2.td | 6 +-
 lib/Target/Mips/MipsISelLowering.cpp | 5 +-
 lib/Target/Mips/MipsISelLowering.h | 2 +-
 lib/Target/X86/X86ISelLowering.cpp | 7 ++
 lib/Target/X86/X86ISelLowering.h | 7 +-
 .../ARM/2011-10-26-memset-with-neon.ll | 8 --
 test/CodeGen/ARM/memcpy-inline.ll | 109 +++++++++++++++++-
 test/CodeGen/ARM/memset-inline.ll | 30 +++++
 test/CodeGen/ARM/reg_asc_order.ll | 16 ---
 test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll | 7 +-
 test/CodeGen/X86/memcpy-2.ll | 12 +-
 15 files changed, 299 insertions(+), 85 deletions(-)
 create mode 100644 test/CodeGen/ARM/memset-inline.ll
 delete mode 100644 test/CodeGen/ARM/reg_asc_order.ll

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 48083eee57a..d2e20107efc 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -371,6 +371,16 @@ public:
     return false;
   }
 
+  /// isIntImmLegal - Returns true if the target can instruction select the
+  /// specified integer immediate natively (that is, it's materialized with one
+  /// instruction). The current *assumption* in isel is that all integer
+  /// immediates are "legal" and only the memcpy / memset expansion code is
+  /// making use of this. The rest of isel doesn't have a proper cost model for
+  /// immediate materialization.
+  virtual bool isIntImmLegal(const APInt &/*Imm*/, EVT /*VT*/) const {
+    return true;
+  }
+
   /// isShuffleMaskLegal - Targets can use this to indicate that they only
   /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
   /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -678,12 +688,14 @@ public:
   }
 
   /// This function returns true if the target allows unaligned memory accesses.
-  /// of the specified type. This is used, for example, in situations where an
-  /// array copy/move/set is converted to a sequence of store operations. It's
-  /// use helps to ensure that such replacements don't generate code that causes
-  /// an alignment error (trap) on the target machine.
+  /// of the specified type. If true, it also returns whether the unaligned
+  /// memory access is "fast" in the second argument by reference.
This is used, + /// for example, in situations where an array copy/move/set is converted to a + /// sequence of store operations. It's use helps to ensure that such + /// replacements don't generate code that causes an alignment error (trap) on + /// the target machine. /// @brief Determine if the target supports unaligned memory accesses. - virtual bool allowsUnalignedMemoryAccesses(EVT) const { + virtual bool allowsUnalignedMemoryAccesses(EVT, bool *Fast = 0) const { return false; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 7dd57d54201..29339405aab 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3373,7 +3373,7 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG, unsigned NumVTBytes = VT.getSizeInBits() / 8; unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); - uint64_t Val = 0; + APInt Val(NumBytes*8, 0); if (TLI.isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << i*8; @@ -3382,7 +3382,9 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG, Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; } - return DAG.getConstant(Val, VT); + if (TLI.isIntImmLegal(Val, VT)) + return DAG.getConstant(Val, VT); + return SDValue(0, 0); } /// getMemBasePlusOffset - Returns base and offset node for the @@ -3422,6 +3424,7 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, unsigned DstAlign, unsigned SrcAlign, bool IsZeroVal, bool MemcpyStrSrc, + bool AllowOverlap, SelectionDAG &DAG, const TargetLowering &TLI) { assert((SrcAlign == 0 || SrcAlign >= DstAlign) && @@ -3461,24 +3464,47 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, unsigned NumMemOps = 0; while (Size != 0) { + if (++NumMemOps > Limit) + return false; + unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. + EVT NewVT; + unsigned NewVTSize; if (VT.isVector() || VT.isFloatingPoint()) { - VT = MVT::i64; - while (!TLI.isTypeLegal(VT)) - VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); - VTSize = VT.getSizeInBits() / 8; + NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32; + while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) { + if (NewVT == MVT::i64 && + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) { + // i64 is usually not legal on 32-bit targets, but f64 may be. + NewVT = MVT::f64; + break; + } + NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); + } + NewVTSize = NewVT.getSizeInBits() / 8; } else { // This can result in a type that is not legal on the target, e.g. // 1 or 2 bytes on PPC. - VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); - VTSize >>= 1; + NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + NewVTSize = VTSize >> 1; + } + + // If the new VT cannot cover all of the remaining bits, then consider + // issuing a (or a pair of) unaligned and overlapping load / store. + // FIXME: Only does this for 64-bit or more since we don't have proper + // cost model for unaligned load / store. 
+ bool Fast; + if (AllowOverlap && VTSize >= 8 && NewVTSize < Size && + TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast) + VTSize = Size; + else { + VT = NewVT; + VTSize = NewVTSize; } } - if (++NumMemOps > Limit) - return false; MemOps.push_back(VT); Size -= VTSize; } @@ -3523,7 +3549,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroStr ? 0 : SrcAlign), - true, CopyFromStr, DAG, TLI)) + true, CopyFromStr, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { @@ -3545,6 +3571,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value, Store; + if (VTSize > Size) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + assert(i == NumMemOps-1 && i != 0); + SrcOff -= VTSize - Size; + DstOff -= VTSize - Size; + } + if (CopyFromStr && (isZeroStr || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single @@ -3553,11 +3587,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff)); - Store = DAG.getStore(Chain, dl, Value, - getMemBasePlusOffset(Dst, DstOff, DAG), - DstPtrInfo.getWithOffset(DstOff), isVol, - false, Align); - } else { + if (Value.getNode()) + Store = DAG.getStore(Chain, dl, Value, + getMemBasePlusOffset(Dst, DstOff, DAG), + DstPtrInfo.getWithOffset(DstOff), isVol, + false, Align); + } + + if (!Store.getNode()) { // The type might not be legal for the target. This should only happen // if the type is smaller than a legal type, as on PPC, so the right // thing to do is generate a LoadExt/StoreTrunc pair. These simplify @@ -3577,6 +3614,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, OutChains.push_back(Store); SrcOff += VTSize; DstOff += VTSize; + Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, @@ -3613,7 +3651,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), - SrcAlign, true, false, DAG, TLI)) + SrcAlign, true, false, false, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { @@ -3689,7 +3727,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, isa(Src) && cast(Src)->isNullValue(); if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 0 : Align), 0, - IsZeroVal, false, DAG, TLI)) + IsZeroVal, false, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { @@ -3716,6 +3754,13 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; + unsigned VTSize = VT.getSizeInBits() / 8; + if (VTSize > Size) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + assert(i == NumMemOps-1 && i != 0); + DstOff -= VTSize - Size; + } // If this store is smaller than the largest store see whether we can get // the smaller value for free with a truncate. 
@@ -3734,6 +3779,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, isVol, false, Align); OutChains.push_back(Store); DstOff += VT.getSizeInBits() / 8; + Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 32235b9d0c6..a0fe215e941 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -833,9 +833,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setSchedulingPreference(Sched::Hybrid); //// temporary - rewrite interface to use type - maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; - maxStoresPerMemset = 16; + maxStoresPerMemset = 8; maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores + maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. @@ -9406,7 +9409,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } -bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); @@ -9415,15 +9418,27 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { return false; case MVT::i8: case MVT::i16: - case MVT::i32: + case MVT::i32: { // Unaligned access can use (for example) LRDB, LRDH, LDR - return AllowsUnaligned; + if (AllowsUnaligned) { + if (Fast) + *Fast = Subtarget->hasV7Ops(); + return true; + } + return false; + } case MVT::f64: - case MVT::v2f64: + case MVT::v2f64: { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explictly support unaligned accesses - return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian()); + if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Fast) + *Fast = true; + return true; + } + return false; + } } } @@ -9442,12 +9457,17 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, // See if we can use NEON instructions for this... 
if (IsZeroVal && - !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) && - Subtarget->hasNEON()) { - if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) { - return MVT::v4i32; - } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) { - return MVT::v2i32; + Subtarget->hasNEON() && + !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { + bool Fast; + if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && + Fast))) { + return MVT::v2f64; + } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && + Fast))) { + return MVT::f64; } } @@ -10241,6 +10261,24 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const { + if (VT.getSizeInBits() > 32) + return false; + + int32_t ImmVal = Imm.getSExtValue(); + if (!Subtarget->isThumb()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getSOImmVal(ImmVal) != -1) || + (ARM_AM::getSOImmVal(~ImmVal) != -1); + } else if (Subtarget->isThumb2()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getT2SOImmVal(ImmVal) != -1) || + (ARM_AM::getT2SOImmVal(~ImmVal) != -1); + } else /*Thumb1*/ { + return (ImmVal >= 0 && ImmVal < 256); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 1d5aa4fb382..5cf40236c5c 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -285,8 +285,9 @@ namespace llvm { bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - virtual bool allowsUnalignedMemoryAccesses(EVT VT) const; + /// unaligned memory accesses of the specified type. Returns whether it + /// is "fast" by reference in the second argument. + virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const; virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, @@ -386,6 +387,8 @@ namespace llvm { /// materialize the FP immediate as a load from a constant pool. virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + virtual bool isIntImmLegal(const APInt &Imm, EVT VT) const; + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index d40a0746f85..cf8b3024fb4 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2315,13 +2315,15 @@ defm t2ORN : T2I_bin_irs<0b0011, "orn", /// changed to modify CPSR. 
multiclass T2I_un_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, - PatFrag opnode, bit Cheap = 0, bit ReMat = 0> { + PatFrag opnode, + bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> { // shifted imm def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii, opc, "\t$Rd, $imm", [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> { let isAsCheapAsAMove = Cheap; let isReMaterializable = ReMat; + let isMoveImm = MoveImm; let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -2355,7 +2357,7 @@ multiclass T2I_un_irs opcod, string opc, let AddedComplexity = 1 in defm t2MVN : T2I_un_irs <0b0011, "mvn", IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi, - UnOpFrag<(not node:$Src)>, 1, 1>; + UnOpFrag<(not node:$Src)>, 1, 1, 1>; let AddedComplexity = 1 in def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 703f8cec22b..619ae077b30 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -457,7 +457,8 @@ MipsTargetLowering(MipsTargetMachine &TM) maxStoresPerMemcpy = 16; } -bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { +bool +MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy; if (Subtarget->inMips16Mode()) @@ -466,6 +467,8 @@ bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { switch (SVT) { case MVT::i64: case MVT::i32: + if (Fast) + *Fast = true; return true; default: return false; diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 2c17def06a1..4b318dc16f7 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -149,7 +149,7 @@ namespace llvm { virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; } - virtual bool allowsUnalignedMemoryAccesses (EVT VT) const; + virtual bool allowsUnalignedMemoryAccesses (EVT VT, bool *Fast) const; virtual void LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 84e5677d6c1..90bee41e35d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1412,6 +1412,13 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::i32; } +bool +X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { + if (Fast) + *Fast = Subtarget->isUnalignedMemAccessFast(); + return true; +} + /// getJumpTableEncoding - Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 1042fe13ec7..a515be23ef6 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -507,10 +507,9 @@ namespace llvm { MachineFunction &MF) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - virtual bool allowsUnalignedMemoryAccesses(EVT VT) const { - return true; - } + /// unaligned memory accesses. of the specified type. Returns whether it + /// is "fast" by reference in the second argument. + virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const; /// LowerOperation - Provide custom lowering hooks for some operations. 
/// diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll index 6e0ef961965..f563eeef018 100644 --- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll +++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll @@ -1,13 +1,5 @@ ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s -; Should trigger a NEON store. -; CHECK: vstr -define void @f_0_12(i8* nocapture %c) nounwind optsize { -entry: - call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) - ret void -} - ; Trigger multiple NEON stores. ; CHECK: vst1.64 ; CHECK-NEXT: vst1.64 diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index dc772827f27..d846e5cb268 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -1,18 +1,115 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -disable-post-ra | FileCheck %s - -; CHECK: ldrd -; CHECK: strd -; CHECK: ldrb +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } @src = external global %struct.x @dst = external global %struct.x -define i32 @t() { +@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 +@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 +@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 +@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 +@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 +@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 + +define i32 @t0() { entry: +; CHECK: t0: +; CHECK: vldr [[REG1:d[0-9]+]], +; CHECK: vstr [[REG1]], call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false) ret i32 0 } +define void @t1(i8* nocapture %C) nounwind { +entry: +; CHECK: t1: +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #15 +; CHECK: adds r1, #15 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false) + ret void +} + +define void @t2(i8* nocapture %C) nounwind { +entry: +; CHECK: t2: +; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32] +; CHECK: str [[REG2]], [r0, #32] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #16 +; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) + ret void +} + +define void @t3(i8* nocapture %C) nounwind { +entry: +; CHECK: t3: +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #16 +; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) + ret void +} 
+ +define void @t4(i8* nocapture %C) nounwind { +entry: +; CHECK: t4: +; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] +; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) + ret void +} + +define void @t5(i8* nocapture %C) nounwind { +entry: +; CHECK: t5: +; CHECK: movs [[REG5:r[0-9]+]], #0 +; CHECK: strb [[REG5]], [r0, #6] +; CHECK: movw [[REG6:r[0-9]+]], #21587 +; CHECK: strh [[REG6]], [r0, #4] +; CHECK: ldr [[REG7:r[0-9]+]], +; CHECK: str [[REG7]] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) + ret void +} + +define void @t6() nounwind { +entry: +; CHECK: t6: +; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0] +; CHECK: vstr [[REG8]], [r1] +; CHECK: adds r1, #6 +; CHECK: adds r0, #6 +; CHECK: vld1.8 +; CHECK: vst1.16 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false) + ret void +} + +%struct.Foo = type { i32, i32, i32, i32 } + +define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind { +entry: +; CHECK: t7 +; CHECK: vld1.32 +; CHECK: vst1.32 + %0 = bitcast %struct.Foo* %a to i8* + %1 = bitcast %struct.Foo* %b to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false) + ret void +} + declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll new file mode 100644 index 00000000000..ee8c3643388 --- /dev/null +++ b/test/CodeGen/ARM/memset-inline.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s + +define void @t1(i8* nocapture %c) nounwind optsize { +entry: +; CHECK: t1: +; CHECK: movs r1, #0 +; CHECK: str r1, [r0] +; CHECK: str r1, [r0, #4] +; CHECK: str r1, [r0, #8] + call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) + ret void +} + +define void @t2() nounwind ssp { +entry: +; CHECK: t2: +; CHECK: add.w r1, r0, #10 +; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 +; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + %buf = alloca [26 x i8], align 1 + %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0 + call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) + call void @something(i8* %0) nounwind + ret void +} + +declare void @something(i8*) nounwind +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/reg_asc_order.ll b/test/CodeGen/ARM/reg_asc_order.ll deleted file mode 100644 index d1d0ee5f3e7..00000000000 --- a/test/CodeGen/ARM/reg_asc_order.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -; Check that memcpy gets lowered to ldm/stm, at least in this very smple case. 
- -%struct.Foo = type { i32, i32, i32, i32 } - -define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind { -entry: -;CHECK: ldm -;CHECK: stm - %0 = bitcast %struct.Foo* %a to i8* - %1 = bitcast %struct.Foo* %b to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false) - ret void -} - -declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll index 94075e78a28..c2d9d84d4c5 100644 --- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll +++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll @@ -6,15 +6,16 @@ define void @t(i32 %count) ssp nounwind { entry: ; CHECK: t: -; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip) -; CHECK: movups L_str(%rip), %xmm0 +; CHECK: movups L_str+12(%rip), %xmm0 +; CHECK: movups L_str(%rip), %xmm1 %tmp0 = alloca [60 x i8], align 1 %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0 br label %bb1 bb1: ; CHECK: LBB0_1: -; CHECK: movaps %xmm0, (%rsp) +; CHECK: movups %xmm0, 12(%rsp) +; CHECK: movaps %xmm1, (%rsp) %tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ] call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false) %tmp3 = add i32 %tmp2, 1 diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 7a2bbc4ef0f..dcc8f0d268c 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -10,18 +10,18 @@ define void @t1(i32 %argc, i8** %argv) nounwind { entry: ; SSE2: t1: +; SSE2: movsd _.str+16, %xmm0 +; SSE2: movsd %xmm0, 16(%esp) ; SSE2: movaps _.str, %xmm0 ; SSE2: movaps %xmm0 -; SSE2: movb $0 -; SSE2: movl $0 -; SSE2: movl $0 +; SSE2: movb $0, 24(%esp) ; SSE1: t1: +; SSE1: fldl _.str+16 +; SSE1: fstpl 16(%esp) ; SSE1: movaps _.str, %xmm0 ; SSE1: movaps %xmm0 -; SSE1: movb $0 -; SSE1: movl $0 -; SSE1: movl $0 +; SSE1: movb $0, 24(%esp) ; NOSSE: t1: ; NOSSE: movb $0
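
As a worked illustration of point 1 of the commit message (covering the trailing bytes with one more full-width, overlapping unaligned operation instead of a chain of narrower stores), here is a minimal standalone C++ sketch. It is not part of the patch and not LLVM code: copyWithOverlap and OpSize are hypothetical names, and std::memcpy of OpSize bytes stands in for a single unaligned vector load / store pair (e.g. movups / movaps on x86 or vld1.8 / vst1.8 on ARM). It assumes the copy is at least OpSize bytes long; for shorter copies the patch falls back to narrower legal types instead.

// Standalone sketch (assumed helper, not LLVM code) of the overlapping
// trailing copy.  std::memcpy stands in for one unaligned load / store pair.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

static void copyWithOverlap(uint8_t *Dst, const uint8_t *Src, size_t Size,
                            size_t OpSize) {
  size_t Off = 0;
  while (Size >= OpSize) {            // full-width operations first
    std::memcpy(Dst + Off, Src + Off, OpSize);
    Off += OpSize;
    Size -= OpSize;
  }
  if (Size != 0) {
    // Trailing bytes: issue one more full-width operation shifted back so it
    // ends exactly at the last byte, overlapping bytes already copied.  This
    // mirrors the "SrcOff -= VTSize - Size; DstOff -= VTSize - Size;"
    // adjustment the patch adds in getMemcpyLoadsAndStores.
    size_t Back = OpSize - Size;
    assert(Off >= Back && "need at least one full-width op to overlap with");
    std::memcpy(Dst + Off - Back, Src + Off - Back, OpSize);
  }
}

With OpSize == 16, a 20-byte copy becomes one 16-byte operation at offset 0 and another at offset 4, i.e. the two overlapping movups / movaps pairs the commit message describes for 17 - 31 byte copies.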