diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 20afab782a9..216ca3450c5 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1917,7 +1917,9 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(UDIVREM32); OPCODE(UDIVREM64); OPCODE(MVC); + OPCODE(MVC_LOOP); OPCODE(CLC); + OPCODE(CLC_LOOP); OPCODE(STRCMP); OPCODE(STPCPY); OPCODE(SEARCH_STRING); @@ -1952,18 +1954,31 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) { return NewMBB; } -// Split MBB after MI and return the new block (the one that contains -// instructions after MI). -static MachineBasicBlock *splitBlockAfter(MachineInstr *MI, - MachineBasicBlock *MBB) { +// Split MBB before MI and return the new block (the one that contains MI). +static MachineBasicBlock *splitBlockBefore(MachineInstr *MI, + MachineBasicBlock *MBB) { MachineBasicBlock *NewMBB = emitBlockAfter(MBB); - NewMBB->splice(NewMBB->begin(), MBB, - llvm::next(MachineBasicBlock::iterator(MI)), - MBB->end()); + NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end()); NewMBB->transferSuccessorsAndUpdatePHIs(MBB); return NewMBB; } +// Force base value Base into a register before MI. Return the register. +static unsigned forceReg(MachineInstr *MI, MachineOperand &Base, + const SystemZInstrInfo *TII) { + if (Base.isReg()) + return Base.getReg(); + + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg) + .addOperand(Base).addImm(0).addReg(0); + return Reg; +} + // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI. 
MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr *MI, @@ -1978,7 +1993,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); // StartMBB: @@ -1999,7 +2014,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI, // %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ] // ... MBB = JoinMBB; - BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg) + BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg) .addReg(TrueReg).addMBB(StartMBB) .addReg(FalseReg).addMBB(FalseMBB); @@ -2046,7 +2061,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI, CCMask ^= CCValid; MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); // StartMBB: @@ -2122,7 +2137,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI, // Insert a basic block for the main loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); // StartMBB: @@ -2244,7 +2259,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI, // Insert 3 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB); MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB); @@ -2351,7 +2366,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI, // Insert 2 basic blocks for the loop. 
MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB); @@ -2465,17 +2480,126 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, MachineBasicBlock *MBB, unsigned Opcode) const { const SystemZInstrInfo *TII = TM.getInstrInfo(); + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); - MachineOperand DestBase = MI->getOperand(0); + MachineOperand DestBase = earlyUseOperand(MI->getOperand(0)); uint64_t DestDisp = MI->getOperand(1).getImm(); - MachineOperand SrcBase = MI->getOperand(2); + MachineOperand SrcBase = earlyUseOperand(MI->getOperand(2)); uint64_t SrcDisp = MI->getOperand(3).getImm(); uint64_t Length = MI->getOperand(4).getImm(); - BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .addOperand(DestBase).addImm(DestDisp).addImm(Length) - .addOperand(SrcBase).addImm(SrcDisp); + // Check for the loop form, in which operand 5 is the trip count. + if (MI->getNumExplicitOperands() > 5) { + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); + + uint64_t StartCountReg = MI->getOperand(5).getReg(); + uint64_t StartSrcReg = forceReg(MI, SrcBase, TII); + uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg : + forceReg(MI, DestBase, TII)); + + const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass; + uint64_t ThisSrcReg = MRI.createVirtualRegister(RC); + uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg : + MRI.createVirtualRegister(RC)); + uint64_t NextSrcReg = MRI.createVirtualRegister(RC); + uint64_t NextDestReg = (HaveSingleBase ? 
NextSrcReg : + MRI.createVirtualRegister(RC)); + + RC = &SystemZ::GR64BitRegClass; + uint64_t ThisCountReg = MRI.createVirtualRegister(RC); + uint64_t NextCountReg = MRI.createVirtualRegister(RC); + + MachineBasicBlock *StartMBB = MBB; + MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + + // StartMBB: + // # fall through to LoopMBB + MBB->addSuccessor(LoopMBB); + + // LoopMBB: + // %ThisDestReg = phi [ %StartDestReg, StartMBB ], + // [ %NextDestReg, LoopMBB ] + // %ThisSrcReg = phi [ %StartSrcReg, StartMBB ], + // [ %NextSrcReg, LoopMBB ] + // %ThisCountReg = phi [ %StartCountReg, StartMBB ], + // [ %NextCountReg, LoopMBB ] + // PFD 2, 768+DestDisp(%ThisDestReg) + // Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg) + // %NextDestReg = LA 256(%ThisDestReg) + // %NextSrcReg = LA 256(%ThisSrcReg) + // %NextCountReg = AGHI %ThisCountReg, -1 + // CGHI %NextCountReg, 0 + // JLH LoopMBB + // # fall through to DoneMBB + // + // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. 
+ MBB = LoopMBB; + + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg) + .addReg(StartDestReg).addMBB(StartMBB) + .addReg(NextDestReg).addMBB(LoopMBB); + if (!HaveSingleBase) + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg) + .addReg(StartSrcReg).addMBB(StartMBB) + .addReg(NextSrcReg).addMBB(LoopMBB); + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg) + .addReg(StartCountReg).addMBB(StartMBB) + .addReg(NextCountReg).addMBB(LoopMBB); + BuildMI(MBB, DL, TII->get(SystemZ::PFD)) + .addImm(SystemZ::PFD_WRITE) + .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); + BuildMI(MBB, DL, TII->get(Opcode)) + .addReg(ThisDestReg).addImm(DestDisp).addImm(256) + .addReg(ThisSrcReg).addImm(SrcDisp); + BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg) + .addReg(ThisDestReg).addImm(256).addReg(0); + if (!HaveSingleBase) + BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg) + .addReg(ThisSrcReg).addImm(256).addReg(0); + BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg) + .addReg(ThisCountReg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(NextCountReg).addImm(0); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) + .addMBB(LoopMBB); + MBB->addSuccessor(LoopMBB); + MBB->addSuccessor(DoneMBB); + + DestBase = MachineOperand::CreateReg(NextDestReg, false); + SrcBase = MachineOperand::CreateReg(NextSrcReg, false); + Length &= 255; + MBB = DoneMBB; + } + // Handle any remaining bytes with straight-line code. + while (Length > 0) { + uint64_t ThisLength = std::min(Length, uint64_t(256)); + // The previous iteration might have created out-of-range displacements. + // Apply them using LAY if so. 
+ if (!isUInt<12>(DestDisp)) { + unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg) + .addOperand(DestBase).addImm(DestDisp).addReg(0); + DestBase = MachineOperand::CreateReg(Reg, false); + DestDisp = 0; + } + if (!isUInt<12>(SrcDisp)) { + unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg) + .addOperand(SrcBase).addImm(SrcDisp).addReg(0); + SrcBase = MachineOperand::CreateReg(Reg, false); + SrcDisp = 0; + } + BuildMI(*MBB, MI, DL, TII->get(Opcode)) + .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength) + .addOperand(SrcBase).addImm(SrcDisp); + DestDisp += ThisLength; + SrcDisp += ThisLength; + Length -= ThisLength; + } MI->eraseFromParent(); return MBB; @@ -2503,7 +2627,7 @@ SystemZTargetLowering::emitStringWrapper(MachineInstr *MI, uint64_t End2Reg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB); + MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); // StartMBB: @@ -2765,9 +2889,11 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { case SystemZ::ATOMIC_CMP_SWAPW: return emitAtomicCmpSwapW(MI, MBB); - case SystemZ::MVCWrapper: + case SystemZ::MVCSequence: + case SystemZ::MVCLoop: return emitMemMemWrapper(MI, MBB, SystemZ::MVC); - case SystemZ::CLCWrapper: + case SystemZ::CLCSequence: + case SystemZ::CLCLoop: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index f6a2ce041ec..9831777873a 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -74,16 +74,25 @@ namespace SystemZISD { UDIVREM32, UDIVREM64, - // Use 
MVC to copy bytes from one memory location to another. - // The first operand is the target address, the second operand is the - // source address, and the third operand is the constant length. + // Use a series of MVCs to copy bytes from one memory location to another. + // The operands are: + // - the target address + // - the source address + // - the constant length + // // This isn't a memory opcode because we'd need to attach two // MachineMemOperands rather than one. MVC, + // Like MVC, but implemented as a loop that handles X*256 bytes + // followed by straight-line code to handle the rest (if any). + // The value of X is passed as an additional operand. + MVC_LOOP, + // Use CLC to compare two blocks of memory, with the same comments - // as for MVC. + // as for MVC and MVC_LOOP. CLC, + CLC_LOOP, // Use an MVST-based sequence to implement stpcpy(). STPCPY, diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index dbe0fb5cac3..8d0dcb673fc 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -86,9 +86,9 @@ def : CopySign128; -defm LoadStoreF32 : MVCLoadStore; -defm LoadStoreF64 : MVCLoadStore; -defm LoadStoreF128 : MVCLoadStore; +defm LoadStoreF32 : MVCLoadStore; +defm LoadStoreF64 : MVCLoadStore; +defm LoadStoreF128 : MVCLoadStore; //===----------------------------------------------------------------------===// // Load instructions diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index a7e18ec6812..7f2f9f8805d 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1426,23 +1426,26 @@ class AtomicLoadWBinaryReg class AtomicLoadWBinaryImm : AtomicLoadWBinary; -// Define an instruction that operates on two fixed-length blocks of memory. 
-// The real instruction uses a bdladdr12onlylen8 for the first operand and a -// bdaddr12only for the second, with the length of the second operand being -// implicitly the same as the first. This arrangement matches the underlying -// assembly syntax. However, for instruction selection it's easier to have -// two normal bdaddr12onlys and a separate length operand, so define a pseudo -// instruction for that too. +// Define an instruction that operates on two fixed-length blocks of memory, +// and associated pseudo instructions for operating on blocks of any size. +// The Sequence form uses a straight-line sequence of instructions and +// the Loop form uses a loop of length-256 instructions followed by +// another instruction to handle the excess. multiclass MemorySS opcode, - SDPatternOperator operator> { + SDPatternOperator sequence, SDPatternOperator loop> { def "" : InstSS; - let usesCustomInserter = 1 in - def Wrapper : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm32len8:$length), - [(operator bdaddr12only:$dest, bdaddr12only:$src, - imm32len8:$length)]>; + let usesCustomInserter = 1 in { + def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length), + [(sequence bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length)]>; + def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256), + [(loop bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length, GR64:$count256)]>; + } } // Define an instruction that operates on two strings, both terminated diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 8e1f5ac3c97..399b48a3368 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -344,25 +344,25 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>; // Memory-to-memory moves. 
let mayLoad = 1, mayStore = 1 in - defm MVC : MemorySS<"mvc", 0xD2, z_mvc>; + defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; // String moves. let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0W] in defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>; defm LoadStore8_32 : MVCLoadStore; + MVCSequence, 1>; defm LoadStore16_32 : MVCLoadStore; -defm LoadStore32_32 : MVCLoadStore; + MVCSequence, 2>; +defm LoadStore32_32 : MVCLoadStore; defm LoadStore8 : MVCLoadStore; + MVCSequence, 1>; defm LoadStore16 : MVCLoadStore; + MVCSequence, 2>; defm LoadStore32 : MVCLoadStore; -defm LoadStore64 : MVCLoadStore; + MVCSequence, 4>; +defm LoadStore64 : MVCLoadStore; //===----------------------------------------------------------------------===// // Sign extensions @@ -1028,7 +1028,7 @@ defm : ZXB; // Memory-to-memory comparison. let mayLoad = 1, Defs = [CC] in - defm CLC : MemorySS<"clc", 0xD5, z_clc>; + defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; // String comparison. let mayLoad = 1, Defs = [CC], Uses = [R0W] in diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index eb96dba0f2d..421e41f11e6 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -219,11 +219,6 @@ def uimm8 : Immediate; // i32 immediates //===----------------------------------------------------------------------===// -// Immediates for 8-bit lengths. -def imm32len8 : Immediate(N->getZExtValue() - 1); -}], NOOP_SDNodeXForm, "U32Imm">; - // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being zero. 
def imm32ll16 : Immediate(-N->getSExtValue()); }], NEGIMM32, "U32Imm">; -def imm64 : ImmLeaf; +def imm64 : ImmLeaf, Operand; //===----------------------------------------------------------------------===// // Floating-point immediates diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index e2c43d6e582..ff64ea8fa0b 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -57,7 +57,12 @@ def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6, def SDT_ZMemMemLength : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, - SDTCisVT<2, i32>]>; + SDTCisVT<2, i64>]>; +def SDT_ZMemMemLoop : SDTypeProfile<0, 4, + [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i64>, + SDTCisVT<3, i64>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -123,8 +128,12 @@ def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>; def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; +def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>; +def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 638c3ee6f6f..6026b1f6f0e 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -25,6 +25,30 @@ SystemZSelectionDAGInfo(const SystemZTargetMachine &TM) SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { } +// Use MVC to copy Size bytes from Src to Dest, deciding whether to use +// a loop or straight-line 
code. +static SDValue emitMVC(SelectionDAG &DAG, SDLoc DL, SDValue Chain, + SDValue Dst, SDValue Src, uint64_t Size) { + EVT PtrVT = Src.getValueType(); + // The heuristic we use is to prefer loops for anything that would + // require 7 or more MVCs. With these kinds of sizes there isn't + // much to choose between straight-line code and looping code, + // since the time will be dominated by the MVCs themselves. + // However, the loop has 4 or 5 instructions (depending on whether + // the base addresses can be proved equal), so there doesn't seem + // much point using a loop for 5 * 256 bytes or fewer. Anything in + // the range (5 * 256, 6 * 256) will need another instruction after + // the loop, so it doesn't seem worth using a loop then either. + // The next value up, 6 * 256, can be implemented in the same + // number of straight-line MVCs as 6 * 256 - 1. + if (Size > 6 * 256) + return DAG.getNode(SystemZISD::MVC_LOOP, DL, MVT::Other, Chain, Dst, Src, + DAG.getConstant(Size, PtrVT), + DAG.getConstant(Size / 256, PtrVT)); + return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst, Src, + DAG.getConstant(Size, PtrVT)); +} + SDValue SystemZSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -34,14 +58,8 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, if (IsVolatile) return SDValue(); - if (ConstantSDNode *CSize = dyn_cast(Size)) { - uint64_t Bytes = CSize->getZExtValue(); - if (Bytes >= 1 && Bytes <= 0x100) { - // A single MVC. 
- return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, - Chain, Dst, Src, Size); - } - } + if (ConstantSDNode *CSize = dyn_cast(Size)) + return emitMVC(DAG, DL, Chain, Dst, Src, CSize->getZExtValue()); return SDValue(); } @@ -65,7 +83,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Dst, SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile, MachinePointerInfo DstPtrInfo) const { - EVT DstVT = Dst.getValueType(); + EVT PtrVT = Dst.getValueType(); if (IsVolatile) return SDValue(); @@ -89,8 +107,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, Align, DstPtrInfo); if (Size2 == 0) return Chain1; - Dst = DAG.getNode(ISD::ADD, DL, DstVT, Dst, - DAG.getConstant(Size1, DstVT)); + Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, + DAG.getConstant(Size1, PtrVT)); DstPtrInfo = DstPtrInfo.getWithOffset(Size1); SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2, std::min(Align, Size1), DstPtrInfo); @@ -103,8 +121,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, false, false, Align); if (Bytes == 1) return Chain1; - SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst, - DAG.getConstant(1, DstVT)); + SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, + DAG.getConstant(1, PtrVT)); SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1), false, false, 1); @@ -112,16 +130,13 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, } } assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already"); - if (Bytes <= 0x101) { - // Copy the byte to the first location and then use MVC to copy - // it to the rest. 
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, - false, false, Align); - SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst, - DAG.getConstant(1, DstVT)); - return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst2, Dst, - DAG.getConstant(Bytes - 1, MVT::i32)); - } + // Copy the byte to the first location and then use MVC to copy + // it to the rest. + Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, + false, false, Align); + SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, + DAG.getConstant(1, PtrVT)); + return emitMVC(DAG, DL, Chain, DstPlus1, Dst, Bytes - 1); } return SDValue(); } @@ -144,13 +159,14 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Src1, SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo, MachinePointerInfo Op2PtrInfo) const { + EVT PtrVT = Src1.getValueType(); if (ConstantSDNode *CSize = dyn_cast(Size)) { uint64_t Bytes = CSize->getZExtValue(); if (Bytes >= 1 && Bytes <= 0x100) { // A single CLC. SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, - Src1, Src2, Size); + Src1, Src2, DAG.getConstant(Bytes, PtrVT)); SDValue Glue = Chain.getValue(1); return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain); } diff --git a/test/CodeGen/SystemZ/memcpy-01.ll b/test/CodeGen/SystemZ/memcpy-01.ll index 7cb58b31cce..b53ec5452e2 100644 --- a/test/CodeGen/SystemZ/memcpy-01.ll +++ b/test/CodeGen/SystemZ/memcpy-01.ll @@ -4,7 +4,9 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *nocapture, i8 *nocapture, i32, i32, i1) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i32, i1) nounwind +declare void @foo(i8 *, i8 *) +; Test a no-op move, i32 version. define void @f1(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f1: ; CHECK-NOT: %r2 @@ -15,6 +17,7 @@ define void @f1(i8 *%dest, i8 *%src) { ret void } +; Test a no-op move, i64 version. 
define void @f2(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f2: ; CHECK-NOT: %r2 @@ -25,6 +28,7 @@ define void @f2(i8 *%dest, i8 *%src) { ret void } +; Test a 1-byte move, i32 version. define void @f3(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f3: ; CHECK: mvc 0(1,%r2), 0(%r3) @@ -34,6 +38,7 @@ define void @f3(i8 *%dest, i8 *%src) { ret void } +; Test a 1-byte move, i64 version. define void @f4(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f4: ; CHECK: mvc 0(1,%r2), 0(%r3) @@ -43,6 +48,7 @@ define void @f4(i8 *%dest, i8 *%src) { ret void } +; Test the upper range of a single MVC, i32 version. define void @f5(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f5: ; CHECK: mvc 0(256,%r2), 0(%r3) @@ -52,6 +58,7 @@ define void @f5(i8 *%dest, i8 *%src) { ret void } +; Test the upper range of a single MVC, i64 version. define void @f6(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f6: ; CHECK: mvc 0(256,%r2), 0(%r3) @@ -61,22 +68,168 @@ define void @f6(i8 *%dest, i8 *%src) { ret void } -; 257 bytes is too big for a single MVC. For now expect none, so that -; the test fails and gets updated when large copies are implemented. +; Test the first case that needs two MVCs. define void @f7(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f7: -; CHECK-NOT: mvc +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: mvc 256(1,%r2), 256(%r3) ; CHECK: br %r14 call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%dest, i8 *%src, i32 257, i32 1, i1 false) ret void } +; Test the last-but-one case that needs two MVCs. define void @f8(i8 *%dest, i8 *%src) { ; CHECK-LABEL: f8: -; CHECK-NOT: mvc +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: mvc 256(255,%r2), 256(%r3) ; CHECK: br %r14 - call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 257, i32 1, + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 511, i32 1, i1 false) ret void } + +; Test the last case that needs two MVCs. 
+define void @f9(i8 *%dest, i8 *%src) { +; CHECK-LABEL: f9: +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: mvc 256(256,%r2), 256(%r3) +; CHECK: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 512, i32 1, + i1 false) + ret void +} + +; Test an arbitrary value that uses straight-line code. +define void @f10(i8 *%dest, i8 *%src) { +; CHECK-LABEL: f10: +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: mvc 256(256,%r2), 256(%r3) +; CHECK: mvc 512(256,%r2), 512(%r3) +; CHECK: mvc 768(256,%r2), 768(%r3) +; CHECK: mvc 1024(255,%r2), 1024(%r3) +; CHECK: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1, + i1 false) + ret void +} + +; ...and again in cases where not all parts are in range of MVC. +define void @f11(i8 *%destbase, i8 *%srcbase) { +; CHECK-LABEL: f11: +; CHECK: mvc 4000(256,%r2), 3500(%r3) +; CHECK: lay [[NEWDEST:%r[1-5]]], 4256(%r2) +; CHECK: mvc 0(256,[[NEWDEST]]), 3756(%r3) +; CHECK: mvc 256(256,[[NEWDEST]]), 4012(%r3) +; CHECK: lay [[NEWSRC:%r[1-5]]], 4268(%r3) +; CHECK: mvc 512(256,[[NEWDEST]]), 0([[NEWSRC]]) +; CHECK: mvc 768(255,[[NEWDEST]]), 256([[NEWSRC]]) +; CHECK: br %r14 + %dest = getelementptr i8 *%destbase, i64 4000 + %src = getelementptr i8* %srcbase, i64 3500 + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1, + i1 false) + ret void +} + +; ...and again with a destination frame base that goes out of range. 
+define void @f12() { +; CHECK-LABEL: f12: +; CHECK: brasl %r14, foo@PLT +; CHECK: mvc 4076(256,%r15), 2100(%r15) +; CHECK: lay [[NEWDEST:%r[1-5]]], 4332(%r15) +; CHECK: mvc 0(256,[[NEWDEST]]), 2356(%r15) +; CHECK: mvc 256(256,[[NEWDEST]]), 2612(%r15) +; CHECK: mvc 512(256,[[NEWDEST]]), 2868(%r15) +; CHECK: mvc 768(255,[[NEWDEST]]), 3124(%r15) +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %arr = alloca [6000 x i8] + %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 3900 + %src = getelementptr [6000 x i8] *%arr, i64 0, i64 1924 + call void @foo(i8 *%dest, i8 *%src) + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1, + i1 false) + call void @foo(i8 *%dest, i8 *%src) + ret void +} + +; ...and again with a source frame base that goes out of range. +define void @f13() { +; CHECK-LABEL: f13: +; CHECK: brasl %r14, foo@PLT +; CHECK: mvc 200(256,%r15), 3826(%r15) +; CHECK: mvc 456(256,%r15), 4082(%r15) +; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15) +; CHECK: mvc 712(256,%r15), 0([[NEWSRC]]) +; CHECK: mvc 968(256,%r15), 256([[NEWSRC]]) +; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]]) +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %arr = alloca [6000 x i8] + %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 24 + %src = getelementptr [6000 x i8] *%arr, i64 0, i64 3650 + call void @foo(i8 *%dest, i8 *%src) + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1, + i1 false) + call void @foo(i8 *%dest, i8 *%src) + ret void +} + +; Test the last case that is done using straight-line code. 
+define void @f14(i8 *%dest, i8 *%src) { +; CHECK-LABEL: f14: +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: mvc 256(256,%r2), 256(%r3) +; CHECK: mvc 512(256,%r2), 512(%r3) +; CHECK: mvc 768(256,%r2), 768(%r3) +; CHECK: mvc 1024(256,%r2), 1024(%r3) +; CHECK: mvc 1280(256,%r2), 1280(%r3) +; CHECK: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1536, i32 1, + i1 false) + ret void +} + +; Test the first case that is done using a loop. +define void @f15(i8 *%dest, i8 *%src) { +; CHECK-LABEL: f15: +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 768(%r2) +; CHECK: mvc 0(256,%r2), 0(%r3) +; CHECK: la %r2, 256(%r2) +; CHECK: la %r3, 256(%r3) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: mvc 0(1,%r2), 0(%r3) +; CHECK: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1, + i1 false) + ret void +} + +; ...and again with frame bases, where the base must be loaded into a +; register before the loop. +define void @f16() { +; CHECK-LABEL: f16: +; CHECK: brasl %r14, foo@PLT +; CHECK-DAG: lghi [[COUNT:%r[0-5]]], 6 +; CHECK-DAG: la [[BASE:%r[0-5]]], 160(%r15) +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 2368([[BASE]]) +; CHECK: mvc 1600(256,[[BASE]]), 0([[BASE]]) +; CHECK: la [[BASE]], 256([[BASE]]) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: mvc 1600(1,[[BASE]]), 0([[BASE]]) +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %arr = alloca [3200 x i8] + %dest = getelementptr [3200 x i8] *%arr, i64 0, i64 1600 + %src = getelementptr [3200 x i8] *%arr, i64 0, i64 0 + call void @foo(i8 *%dest, i8 *%src) + call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1, + i1 false) + call void @foo(i8 *%dest, i8 *%src) + ret void +} diff --git a/test/CodeGen/SystemZ/memset-01.ll b/test/CodeGen/SystemZ/memset-01.ll index b272a5bcc69..f17901cc73a 100644 --- a/test/CodeGen/SystemZ/memset-01.ll +++ b/test/CodeGen/SystemZ/memset-01.ll @@ -103,22 +103,58 @@ define void @f10(i8 *%dest, i8 
%val) { ret void } -; 258 bytes, i32 version. 258 bytes is too big for a single MVC. -; For now expect none, so that the test fails and gets updated when -; large copies are implemented. +; 258 bytes, i32 version. We need two MVCs. define void @f11(i8 *%dest, i8 %val) { ; CHECK-LABEL: f11: -; CHECK-NOT: mvc +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8 *%dest, i8 %val, i32 258, i32 1, i1 false) ret void } -; 258 bytes, i64 version, with the same comments as above. +; 258 bytes, i64 version. define void @f12(i8 *%dest, i8 %val) { ; CHECK-LABEL: f12: -; CHECK-NOT: mvc +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 258, i32 1, i1 false) ret void } + +; Test the largest case for which straight-line code is used. +define void @f13(i8 *%dest, i8 %val) { +; CHECK-LABEL: f13: +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(256,%r2), 256(%r2) +; CHECK: mvc 513(256,%r2), 512(%r2) +; CHECK: mvc 769(256,%r2), 768(%r2) +; CHECK: mvc 1025(256,%r2), 1024(%r2) +; CHECK: mvc 1281(256,%r2), 1280(%r2) +; CHECK: br %r14 + call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1537, i32 1, + i1 false) + ret void +} + +; Test the next size up, which uses a loop. We leave the other corner +; cases to memcpy-01.ll. 
+define void @f14(i8 *%dest, i8 %val) { +; CHECK-LABEL: f14: +; CHECK: stc %r3, 0(%r2) +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 769(%r2) +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: la %r2, 256(%r2) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: mvc 1(1,%r2), 0(%r2) +; CHECK: br %r14 + call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1538, i32 1, + i1 false) + ret void +} diff --git a/test/CodeGen/SystemZ/memset-02.ll b/test/CodeGen/SystemZ/memset-02.ll index b74d907aa9a..b4724c0b574 100644 --- a/test/CodeGen/SystemZ/memset-02.ll +++ b/test/CodeGen/SystemZ/memset-02.ll @@ -139,21 +139,23 @@ define void @f14(i8 *%dest) { ret void } -; 258 bytes, i32 version. 258 bytes is too big for a single MVC. -; For now expect none, so that the test fails and gets updated when -; large copies are implemented. +; 258 bytes, i32 version. We need two MVCs. define void @f15(i8 *%dest) { ; CHECK-LABEL: f15: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 128 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8 *%dest, i8 128, i32 258, i32 1, i1 false) ret void } -; 258 bytes, i64 version, with the same comments as above. +; 258 bytes, i64 version. define void @f16(i8 *%dest) { ; CHECK-LABEL: f16: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 128 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8 *%dest, i8 128, i64 258, i32 1, i1 false) ret void diff --git a/test/CodeGen/SystemZ/memset-03.ll b/test/CodeGen/SystemZ/memset-03.ll index 1d48f1ad6dc..3f954c4f79f 100644 --- a/test/CodeGen/SystemZ/memset-03.ll +++ b/test/CodeGen/SystemZ/memset-03.ll @@ -375,21 +375,23 @@ define void @f38(i8 *%dest) { ret void } -; 258 bytes, i32 version. 258 bytes is too big for a single MVC. -; For now expect none, so that the test fails and gets updated when -; large copies are implemented. +; 258 bytes, i32 version. We need two MVCs. 
define void @f39(i8 *%dest) { ; CHECK-LABEL: f39: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 0 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 258, i32 1, i1 false) ret void } -; 258 bytes, i64 version, with the same comments as above. +; 258 bytes, i64 version. define void @f40(i8 *%dest) { ; CHECK-LABEL: f40: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 0 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 258, i32 1, i1 false) ret void diff --git a/test/CodeGen/SystemZ/memset-04.ll b/test/CodeGen/SystemZ/memset-04.ll index 92886921b07..7906e8d10a1 100644 --- a/test/CodeGen/SystemZ/memset-04.ll +++ b/test/CodeGen/SystemZ/memset-04.ll @@ -375,21 +375,23 @@ define void @f38(i8 *%dest) { ret void } -; 258 bytes, i32 version. 258 bytes is too big for a single MVC. -; For now expect none, so that the test fails and gets updated when -; large copies are implemented. +; 258 bytes, i32 version. We need two MVCs. define void @f39(i8 *%dest) { ; CHECK-LABEL: f39: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 255 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8 *%dest, i8 -1, i32 258, i32 1, i1 false) ret void } -; 258 bytes, i64 version, with the same comments as above. +; 258 bytes, i64 version. define void @f40(i8 *%dest) { ; CHECK-LABEL: f40: -; CHECK-NOT: mvc +; CHECK: mvi 0(%r2), 255 +; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8 *%dest, i8 -1, i64 258, i32 1, i1 false) ret void