[SystemZ] Use CLC and IPM to implement memcmp

For now this is restricted to fixed-length comparisons with a length in the range [1, 256], as for memcpy() and MVC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188163 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-03 18:32:50 +00:00 · 2013-08-12 10:28:10 +00:00 · 2013-08-12 10:28:10 +00:00 · ac168b8bc8
commit ac168b8bc8
parent e03a56d62f
12 changed files with 324 additions and 12 deletions
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@ -94,6 +94,20 @@ public:
                          MachinePointerInfo DstPtrInfo) const {
    return SDValue();
  }
+
+  /// EmitTargetCodeForMemcmp - Emit target-specific code for a memcmp of
+  /// Op3 bytes at Op1 and Op2, in cases where that is faster than a
+  /// libcall.  The first returned SDValue is the result of the memcmp and
+  /// the second is the chain.  Return two null SDValues, as this default
+  /// implementation does, if a normal libcall should be used.
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl,
+                          SDValue Chain,
+                          SDValue Op1, SDValue Op2,
+                          SDValue Op3, MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
 };

 } // end llvm namespace
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -58,6 +58,7 @@
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
 #include <algorithm>
 using namespace llvm;

@ -5463,6 +5464,26 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
    return false;

  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+  if (Size && Size->getZExtValue() == 0) {
+    EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+    setValue(&I, DAG.getConstant(0, CallVT));
+    return true;
+  }
+
+  const Value *Arg0 = I.getArgOperand(0);
+  const Value *Arg1 = I.getArgOperand(1);
+  const Value *Arg2 = I.getArgOperand(2);
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Arg0), getValue(Arg1), getValue(Arg2),
+                                MachinePointerInfo(Arg0),
+                                MachinePointerInfo(Arg1));
+  if (Res.first.getNode()) {
+    setValue(&I, Res.first);
+    DAG.setRoot(Res.second);
+    return true;
+  }

  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@ -67,12 +67,12 @@ condition codes.  For example, we could use LCDFR instead of LCDBR.
 --

 We don't optimize block memory operations, except using single MVCs
-for memcpy.
+for memcpy and single CLCs for memcmp.

-It's definitely worth using things like CLC, NC, XC and OC with
+It's definitely worth using things like NC, XC and OC with
 constant lengths.  MVCIN may be worthwhile too.

-We should probably implement things like memcpy using MVC with EXECUTE.
+We should probably implement general memcpy using MVC with EXECUTE.
 Likewise memcmp and CLC.  MVCLE and CLCLE could be useful too.

 --
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@ -1702,6 +1702,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
    OPCODE(UDIVREM64);
    OPCODE(MVC);
    OPCODE(CLC);
+    OPCODE(IPM);
    OPCODE(ATOMIC_SWAPW);
    OPCODE(ATOMIC_LOADW_ADD);
    OPCODE(ATOMIC_LOADW_SUB);
@ -2240,8 +2241,9 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
 }

 MachineBasicBlock *
-SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
-                                      MachineBasicBlock *MBB) const {
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
  const SystemZInstrInfo *TII = TM.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

@ -2251,7 +2253,7 @@ SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
  uint64_t       SrcDisp  = MI->getOperand(3).getImm();
  uint64_t       Length   = MI->getOperand(4).getImm();

-  BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC))
+  BuildMI(*MBB, MI, DL, TII->get(Opcode))
    .addOperand(DestBase).addImm(DestDisp).addImm(Length)
    .addOperand(SrcBase).addImm(SrcDisp);

@ -2483,7 +2485,9 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
  case SystemZ::ATOMIC_CMP_SWAPW:
    return emitAtomicCmpSwapW(MI, MBB);
  case SystemZ::MVCWrapper:
-    return emitMVCWrapper(MI, MBB);
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+  case SystemZ::CLCWrapper:
+    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
  default:
    llvm_unreachable("Unexpected instr type to insert");
  }
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@ -84,6 +84,9 @@ namespace SystemZISD {
    // as for MVC.
    CLC,

+    // Store the CC value in bits 29 and 28 of an integer.
+    IPM,
+
    // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
    // ATOMIC_LOAD_<op>.
    //
@ -234,8 +237,9 @@ private:
                                          unsigned BitSize) const;
  MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
                                        MachineBasicBlock *BB) const;
-  MachineBasicBlock *emitMVCWrapper(MachineInstr *MI,
-                                    MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
 };
 } // end namespace llvm

--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@ -293,6 +293,99 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
  return Count;
 }

+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+  // Only comparisons of one register against an immediate are analyzed.
+  if (MI->getNumExplicitOperands() != 2 ||
+      !MI->getOperand(0).isReg() ||
+      !MI->getOperand(1).isImm())
+    return false;
+
+  SrcReg = MI->getOperand(0).getReg();
+  SrcReg2 = 0;  // There is no second register operand.
+  Value = MI->getOperand(1).getImm();
+  Mask = ~0;
+  return true;
+}
+
+// Return the instruction that defines Reg, provided that Reg is not a
+// physical register and has exactly one non-debug use.  Return null if not.
+static MachineInstr *getDefSingleUse(const MachineRegisterInfo *MRI,
+                                     unsigned Reg) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+
+  MachineRegisterInfo::use_nodbg_iterator UseI = MRI->use_nodbg_begin(Reg);
+  MachineRegisterInfo::use_nodbg_iterator UseE = MRI->use_nodbg_end();
+  // Exactly one use: the range is non-empty and ends after its first entry.
+  bool HasSingleUse = (UseI != UseE && llvm::next(UseI) == UseE);
+
+  return HasSingleUse ? MRI->getUniqueVRegDef(Reg) : 0;
+}
+
+// Return true if MI is an Opcode shift by the constant amount Imm.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+  if (MI->getOpcode() != Opcode)
+    return false;
+  return !MI->getOperand(2).getReg() && MI->getOperand(3).getImm() == Imm;
+}
+
+// Compare compares SrcReg against zero.  Check whether SrcReg contains
+// the result of an IPM sequence that is only used by Compare.  Try to
+// delete both of them if so and return true if a change was made.
+static bool removeIPM(MachineInstr *Compare, unsigned SrcReg,
+                      const MachineRegisterInfo *MRI,
+                      const TargetRegisterInfo *TRI) {
+  MachineInstr *RLL = getDefSingleUse(MRI, SrcReg);
+  if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *SRL = getDefSingleUse(MRI, RLL->getOperand(1).getReg());
+  if (!SRL || !isShift(SRL, SystemZ::SRL, 28))
+    return false;
+
+  MachineInstr *IPM = getDefSingleUse(MRI, SRL->getOperand(1).getReg());
+  if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // For CC values 0, 1 and 2 the RLL result is respectively equal to, less
+  // than and greater than zero, so a signed comparison of it against zero
+  // would just recreate the CC value that IPM read.  Users of Compare can
+  // therefore use that CC value directly, provided that nothing between
+  // the IPM and Compare assigns to CC.  Neither SRL nor RLL does.
+  if (IPM->getParent() != Compare->getParent())
+    return false;
+  MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    if (MI->modifiesRegister(SystemZ::CC, TRI))
+      return false;
+  }
+
+  IPM->eraseFromParent();
+  SRL->eraseFromParent();
+  RLL->eraseFromParent();
+  Compare->eraseFromParent();
+  return true;
+}
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+                                       unsigned SrcReg, unsigned SrcReg2,
+                                       int Mask, int Value,
+                                       const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+  bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+  if (Value == 0 &&
+      !IsLogical &&
+      removeIPM(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+    return true;
+  return false;
+}
+
 // If Opcode is a move that has a conditional variant, return that variant,
 // otherwise return 0.
 static unsigned getConditionalMove(unsigned Opcode) {
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@ -129,6 +129,12 @@ public:
                                MachineBasicBlock *FBB,
                                const SmallVectorImpl<MachineOperand> &Cond,
                                DebugLoc DL) const LLVM_OVERRIDE;
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const
+    LLVM_OVERRIDE;
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
  virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                                   unsigned ExtraPredCycles,
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@ -1117,7 +1117,7 @@ let Defs = [CC] in {

 // Extract CC into bits 29 and 28 of a register.
 let Uses = [CC] in
-  def IPM : InherentRRE<"ipm", 0xB222, GR32, (null_frag)>;
+  def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;

 // Read a 32-bit access register into a GR32.  As with all GR32 operations,
 // the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@ -58,6 +58,7 @@ def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
                                            [SDTCisPtrTy<0>,
                                             SDTCisPtrTy<1>,
                                             SDTCisVT<2, i32>]>;
+def SDT_ZI32Intrinsic       : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;

 //===----------------------------------------------------------------------===//
 // Node definitions
@ -112,7 +113,9 @@ def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
-                                 [SDNPHasChain, SDNPMayLoad]>;
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm               : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+                                 [SDNPInGlue]>;

 //===----------------------------------------------------------------------===//
 // Pattern fragments
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@ -125,3 +125,30 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
  }
  return SDValue();
 }
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2, SDValue Size,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    if (Bytes >= 1 && Bytes <= 0x100) {
+      // A single CLC.
+      SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
+                          Src1, Src2, Size);
+      SDValue Glue = Chain.getValue(1);
+      // IPM inserts the CC value into bits 29 and 28, with 0 meaning "equal",
+      // 1 meaning "less" and 2 meaning "greater".  SRL 28 moves CC to bits
+      // 1 and 0; rotating left by 31 then turns 1 into INT_MIN and 2 into 1.
+      SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+      SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                                DAG.getConstant(28, MVT::i32));
+      SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+                                 DAG.getConstant(31, MVT::i32));
+      return std::make_pair(ROTL, Chain);
+    }
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@ -38,7 +38,13 @@ public:
  EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
                          SDValue Chain, SDValue Dst, SDValue Byte,
                          SDValue Size, unsigned Align, bool IsVolatile,
-                          MachinePointerInfo DstPtrInfo) const;
+                          MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2, SDValue Size,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
 };

 }
--- a/test/CodeGen/SystemZ/memcmp-01.ll
+++ b/test/CodeGen/SystemZ/memcmp-01.ll
@ -0,0 +1,134 @@
+; Test memcmp using CLC.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+  ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+  ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+  %cmp = icmp eq i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+  %cmp = icmp ne i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+  %cmp = icmp sgt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the upper end of the CLC range.  Here the result is used both as
+; an integer and for branching, so the integer has to be computed and is
+; then compared against zero.
+define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: cijl %r2, 0, {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret i32 %res
+}
+
+; 257 bytes is too big for a single CLC.  For now expect a call instead.
+define i32 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, memcmp@PLT
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+  ret i32 %res
+}