diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 216ca3450c5..dd230c62a5a 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1954,6 +1954,18 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) { return NewMBB; } +// Split MBB after MI and return the new block (the one that contains +// instructions after MI). +static MachineBasicBlock *splitBlockAfter(MachineInstr *MI, + MachineBasicBlock *MBB) { + MachineBasicBlock *NewMBB = emitBlockAfter(MBB); + NewMBB->splice(NewMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + NewMBB->transferSuccessorsAndUpdatePHIs(MBB); + return NewMBB; +} + // Split MBB before MI and return the new block (the one that contains MI). static MachineBasicBlock *splitBlockBefore(MachineInstr *MI, MachineBasicBlock *MBB) { @@ -2490,6 +2502,11 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, uint64_t SrcDisp = MI->getOperand(3).getImm(); uint64_t Length = MI->getOperand(4).getImm(); + // When generating more than one CLC, all but the last will need to + // branch to the end when a difference is found. + MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? + splitBlockAfter(MI, MBB) : 0); + // Check for the loop form, in which operand 5 is the trip count. if (MI->getNumExplicitOperands() > 5) { bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); @@ -2514,6 +2531,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, MachineBasicBlock *StartMBB = MBB; MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB); // StartMBB: // # fall through to LoopMMB @@ -2521,13 +2539,44 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, // LoopMBB: // %ThisDestReg = phi [ %StartDestReg, StartMBB ], - // [ %NextDestReg, LoopMBB ] + // [ %NextDestReg, NextMBB ] // %ThisSrcReg = phi [ %StartSrcReg, StartMBB ], - // [ %NextSrcReg, LoopMBB ] + // [ %NextSrcReg, NextMBB ] // %ThisCountReg = phi [ %StartCountReg, StartMBB ], - // [ %NextCountReg, LoopMBB ] - // PFD 2, 768+DestDisp(%ThisDestReg) + // [ %NextCountReg, NextMBB ] + // ( PFD 2, 768+DestDisp(%ThisDestReg) ) // Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg) + // ( JLH EndMBB ) + // + // The prefetch is used only for MVC. The JLH is used only for CLC. + MBB = LoopMBB; + + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg) + .addReg(StartDestReg).addMBB(StartMBB) + .addReg(NextDestReg).addMBB(NextMBB); + if (!HaveSingleBase) + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg) + .addReg(StartSrcReg).addMBB(StartMBB) + .addReg(NextSrcReg).addMBB(NextMBB); + BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg) + .addReg(StartCountReg).addMBB(StartMBB) + .addReg(NextCountReg).addMBB(NextMBB); + if (Opcode == SystemZ::MVC) + BuildMI(MBB, DL, TII->get(SystemZ::PFD)) + .addImm(SystemZ::PFD_WRITE) + .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); + BuildMI(MBB, DL, TII->get(Opcode)) + .addReg(ThisDestReg).addImm(DestDisp).addImm(256) + .addReg(ThisSrcReg).addImm(SrcDisp); + if (EndMBB) { + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) + .addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + MBB->addSuccessor(NextMBB); + } + + // NextMBB: // %NextDestReg = LA 256(%ThisDestReg) // %NextSrcReg = LA 256(%ThisSrcReg) // %NextCountReg = AGHI %ThisCountReg, -1 @@ -2536,24 +2585,8 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, // # fall through to DoneMMB // // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. - MBB = LoopMBB; + MBB = NextMBB; - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg) - .addReg(StartDestReg).addMBB(StartMBB) - .addReg(NextDestReg).addMBB(LoopMBB); - if (!HaveSingleBase) - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg) - .addReg(StartSrcReg).addMBB(StartMBB) - .addReg(NextSrcReg).addMBB(LoopMBB); - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg) - .addReg(StartCountReg).addMBB(StartMBB) - .addReg(NextCountReg).addMBB(LoopMBB); - BuildMI(MBB, DL, TII->get(SystemZ::PFD)) - .addImm(SystemZ::PFD_WRITE) - .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) - .addReg(ThisDestReg).addImm(DestDisp).addImm(256) - .addReg(ThisSrcReg).addImm(SrcDisp); BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg) .addReg(ThisDestReg).addImm(256).addReg(0); if (!HaveSingleBase) @@ -2599,6 +2632,22 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, DestDisp += ThisLength; SrcDisp += ThisLength; Length -= ThisLength; + // If there's another CLC to go, branch to the end if a difference + // was found. + if (EndMBB && Length > 0) { + MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) + .addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + MBB->addSuccessor(NextMBB); + MBB = NextMBB; + } + } + if (EndMBB) { + MBB->addSuccessor(EndMBB); + MBB = EndMBB; + MBB->addLiveIn(SystemZ::CC); } MI->eraseFromParent(); diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 6026b1f6f0e..dc2f225acb2 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -141,6 +141,28 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, return SDValue(); } +// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size), +// deciding whether to use a loop or straight-line code. +static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain, + SDValue Src1, SDValue Src2, uint64_t Size) { + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + EVT PtrVT = Src1.getValueType(); + // A two-CLC sequence is a clear win over a loop, not least because it + // needs only one branch. A three-CLC sequence needs the same number + // of branches as a loop (i.e. 2), but is shorter. That brings us to + // lengths greater than 768 bytes. It seems relatively likely that + // a difference will be found within the first 768 bytes, so we just + // optimize for the smallest number of branch instructions, in order + // to avoid polluting the prediction buffer too much. A loop only ever + // needs 2 branches, whereas a straight-line sequence would need 3 or more. + if (Size > 3 * 256) + return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2, + DAG.getConstant(Size, PtrVT), + DAG.getConstant(Size / 256, PtrVT)); + return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2, + DAG.getConstant(Size, PtrVT)); +} + // Convert the current CC value into an integer that is 0 if CC == 0, // less than zero if CC == 1 and greater than zero if CC >= 2. // The sequence starts with IPM, which puts CC into bits 29 and 28 @@ -159,17 +181,12 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Src1, SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo, MachinePointerInfo Op2PtrInfo) const { - EVT PtrVT = Src1.getValueType(); if (ConstantSDNode *CSize = dyn_cast(Size)) { uint64_t Bytes = CSize->getZExtValue(); - if (Bytes >= 1 && Bytes <= 0x100) { - // A single CLC. - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, - Src1, Src2, Size, DAG.getConstant(0, PtrVT)); - SDValue Glue = Chain.getValue(1); - return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain); - } + assert(Bytes > 0 && "Caller should have handled 0-size case"); + Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes); + SDValue Glue = Chain.getValue(1); + return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain); } return std::make_pair(SDValue(), SDValue()); } diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll index 5f5752b336d..a0144194693 100644 --- a/test/CodeGen/SystemZ/memcmp-01.ll +++ b/test/CodeGen/SystemZ/memcmp-01.ll @@ -123,11 +123,99 @@ exit: ret i32 %res } -; 257 bytes is too big for a single CLC. For now expect a call instead. +; 257 bytes needs two CLCs. define i32 @f8(i8 *%src1, i8 *%src2) { ; CHECK-LABEL: f8: -; CHECK: brasl %r14, memcmp@PLT +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] ; CHECK: br %r14 %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) ret i32 %res } + +; Test a comparison of 258 bytes in which the CC result can be used directly. +define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f9: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK-NEXT: jl .L +; CHECK: br %r14 +entry: + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) + %cmp = icmp slt i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret void +} + +; Test the largest size that can use two CLCs. +define i32 @f10(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f10: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512) + ret i32 %res +} + +; Test the smallest size that needs 3 CLCs. +define i32 @f11(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f11: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: jlh [[LABEL]] +; CHECK: clc 512(1,%r2), 512(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513) + ret i32 %res +} + +; Test the largest size than can use 3 CLCs. +define i32 @f12(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f12: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(256,%r2), 256(%r3) +; CHECK: jlh [[LABEL]] +; CHECK: clc 512(256,%r2), 512(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768) + ret i32 %res +} + +; The next size up uses a loop instead. We leave the more complicated +; loop tests to memcpy-01.ll, which shares the same form. +define i32 @f13(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f13: +; CHECK: lghi [[COUNT:%r[0-5]]], 3 +; CHECK: [[LOOP:.L[^:]*]]: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK-DAG: la %r2, 256(%r2) +; CHECK-DAG: la %r3, 256(%r3) +; CHECK: brctg [[COUNT]], [[LOOP]] +; CHECK: clc 0(1,%r2), 0(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769) + ret i32 %res +} diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll index cae3d3d4943..74b090dcdd8 100644 --- a/test/CodeGen/SystemZ/memcmp-02.ll +++ b/test/CodeGen/SystemZ/memcmp-02.ll @@ -125,10 +125,14 @@ exit: ret i64 %res } -; 257 bytes is too big for a single CLC. For now expect a call instead. +; 257 bytes needs two CLCs. define i64 @f8(i8 *%src1, i8 *%src2) { ; CHECK-LABEL: f8: -; CHECK: brasl %r14, memcmp@PLT +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: jlh [[LABEL:\..*]] +; CHECK: clc 256(1,%r2), 256(%r3) +; CHECK: [[LABEL]]: +; CHECK: ipm [[REG:%r[0-5]]] ; CHECK: br %r14 %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257) ret i64 %res