[SystemZ] Use CLC and IPM to implement memcmp

For now this is restricted to fixed-length comparisons with a length
in the range [1, 256], as for memcpy() and MVC.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188163 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Richard Sandiford 2013-08-12 10:28:10 +00:00
parent e03a56d62f
commit ac168b8bc8
12 changed files with 324 additions and 12 deletions

View File

@ -94,6 +94,20 @@ public:
MachinePointerInfo DstPtrInfo) const {
return SDValue();
}
/// EmitTargetCodeForMemcmp - Target hook for expanding a memcmp inline
/// when that beats calling the library routine.  On success the returned
/// pair holds the memcmp result in its first element and the updated
/// chain in its second.  A pair of null SDValues tells the caller to fall
/// back to the ordinary libcall.  Op1, Op2 and Op3 are the three memcmp
/// operands in call order; Op1PtrInfo and Op2PtrInfo describe the memory
/// behind the first two.
virtual std::pair<SDValue, SDValue>
EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl,
                        SDValue Chain,
                        SDValue Op1, SDValue Op2,
                        SDValue Op3, MachinePointerInfo Op1PtrInfo,
                        MachinePointerInfo Op2PtrInfo) const {
  // The generic implementation never expands anything: both elements are
  // default-constructed (null) SDValues.
  std::pair<SDValue, SDValue> NoExpansion;
  return NoExpansion;
}
};
} // end llvm namespace

View File

@ -58,6 +58,7 @@
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
#include <algorithm>
using namespace llvm;
@ -5463,6 +5464,26 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
return false;
const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
if (Size && Size->getZExtValue() == 0) {
EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
setValue(&I, DAG.getConstant(0, CallVT));
return true;
}
const Value *Arg0 = I.getArgOperand(0);
const Value *Arg1 = I.getArgOperand(1);
const Value *Arg2 = I.getArgOperand(2);
const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), getValue(Arg1), getValue(Arg2),
MachinePointerInfo(Arg0),
MachinePointerInfo(Arg1));
if (Res.first.getNode()) {
setValue(&I, Res.first);
DAG.setRoot(Res.second);
return true;
}
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0

View File

@ -67,12 +67,12 @@ condition codes. For example, we could use LCDFR instead of LCDBR.
--
We don't optimize block memory operations, except using single MVCs
for memcpy.
for memcpy and single CLCs for memcmp.
It's definitely worth using things like CLC, NC, XC and OC with
It's definitely worth using things like NC, XC and OC with
constant lengths. MVCIN may be worthwhile too.
We should probably implement things like memcpy using MVC with EXECUTE.
We should probably implement general memcpy using MVC with EXECUTE.
Likewise memcmp and CLC. MVCLE and CLCLE could be useful too.
--

View File

@ -1702,6 +1702,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(UDIVREM64);
OPCODE(MVC);
OPCODE(CLC);
OPCODE(IPM);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
@ -2240,8 +2241,9 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
}
MachineBasicBlock *
SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
MachineBasicBlock *MBB) const {
SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
MachineBasicBlock *MBB,
unsigned Opcode) const {
const SystemZInstrInfo *TII = TM.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@ -2251,7 +2253,7 @@ SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
uint64_t SrcDisp = MI->getOperand(3).getImm();
uint64_t Length = MI->getOperand(4).getImm();
BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC))
BuildMI(*MBB, MI, DL, TII->get(Opcode))
.addOperand(DestBase).addImm(DestDisp).addImm(Length)
.addOperand(SrcBase).addImm(SrcDisp);
@ -2483,7 +2485,9 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
case SystemZ::MVCWrapper:
return emitMVCWrapper(MI, MBB);
return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
case SystemZ::CLCWrapper:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
default:
llvm_unreachable("Unexpected instr type to insert");
}

View File

@ -84,6 +84,9 @@ namespace SystemZISD {
// as for MVC.
CLC,
// Store the CC value in bits 29 and 28 of an integer.
IPM,
// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
// ATOMIC_LOAD_<op>.
//
@ -234,8 +237,9 @@ private:
unsigned BitSize) const;
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitMVCWrapper(MachineInstr *MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
MachineBasicBlock *BB,
unsigned Opcode) const;
};
} // end namespace llvm

View File

@ -293,6 +293,99 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return Count;
}
// Describe the comparison MI to the generic compare optimizer.  Only the
// "register against immediate" form is recognised: it must have exactly
// two explicit operands, a register followed by an immediate.  On success
// SrcReg is that register, SrcReg2 is 0, Value is the immediate, Mask is
// all-ones and the function returns true.  Any other form returns false
// and leaves the outputs untouched.
bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
                                      unsigned &SrcReg, unsigned &SrcReg2,
                                      int &Mask, int &Value) const {
  assert(MI->isCompare() && "Caller should have checked for a comparison");

  // Reject everything that is not a two-operand reg/imm comparison.
  if (MI->getNumExplicitOperands() != 2)
    return false;
  if (!MI->getOperand(0).isReg() || !MI->getOperand(1).isImm())
    return false;

  SrcReg = MI->getOperand(0).getReg();
  SrcReg2 = 0;            // No second register operand.
  Mask = ~0;              // Every bit of the register takes part.
  Value = MI->getOperand(1).getImm();
  return true;
}
// Return the unique defining instruction of Reg, provided that Reg is not
// a physical register and has exactly one non-debug use.  Return null in
// every other case (physical register, no uses, or more than one use).
static MachineInstr *getDefSingleUse(const MachineRegisterInfo *MRI,
                                     unsigned Reg) {
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return 0;

  MachineRegisterInfo::use_nodbg_iterator UseI = MRI->use_nodbg_begin(Reg);
  MachineRegisterInfo::use_nodbg_iterator UseE = MRI->use_nodbg_end();

  // Exactly one use: the list is non-empty and ends right after the first
  // entry.
  bool HasSingleUse = UseI != UseE && llvm::next(UseI) == UseE;
  return HasSingleUse ? MRI->getUniqueVRegDef(Reg) : 0;
}
// Return true if MI has opcode Opcode, carries no register in operand 2
// and has the immediate Imm in operand 3, i.e. it is a shift of type
// Opcode by exactly Imm bits.
// NOTE(review): operand 2 is presumably the optional register part of the
// shift amount -- confirm against the shift instruction definitions.
static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
  if (MI->getOpcode() != Opcode)
    return false;
  // A register contribution would make the shift amount non-constant.
  if (MI->getOperand(2).getReg())
    return false;
  return MI->getOperand(3).getImm() == Imm;
}
// Compare tests SrcReg against zero.  If SrcReg is produced by the chain
//
//   IPM -> SLL by 2 -> SRA by 30
//
// in which every intermediate value has a single non-debug use, and nothing
// other than that SRA writes CC between the IPM and Compare, then Compare
// would only recompute the CC value that IPM read.  In that case delete the
// three instructions together with Compare and return true.  Return false,
// changing nothing, if any part of the pattern does not match.
static bool removeIPM(MachineInstr *Compare, unsigned SrcReg,
                      const MachineRegisterInfo *MRI,
                      const TargetRegisterInfo *TRI) {
  // Walk the definition chain backwards from the compared register.
  MachineInstr *ShiftRight = getDefSingleUse(MRI, SrcReg);
  if (ShiftRight == 0 || !isShift(ShiftRight, SystemZ::SRA, 30))
    return false;

  MachineInstr *ShiftLeft =
    getDefSingleUse(MRI, ShiftRight->getOperand(1).getReg());
  if (ShiftLeft == 0 || !isShift(ShiftLeft, SystemZ::SLL, 2))
    return false;

  MachineInstr *InsertPM =
    getDefSingleUse(MRI, ShiftLeft->getOperand(1).getReg());
  if (InsertPM == 0 || InsertPM->getOpcode() != SystemZ::IPM)
    return false;

  // The scan below only works within a single basic block.
  if (InsertPM->getParent() != Compare->getParent())
    return false;

  // Make sure CC is not reassigned between the IPM and Compare, other than
  // by the SRA that is about to be deleted.  SLL needs no special case
  // because it does not assign to CC.  Uses of the CC value set by the SRA
  // are tolerated on the grounds that the SRA effectively restores CC to
  // its pre-IPM value (for all current use cases).
  // NOTE(review): that equivalence depends on the sign of the SLL/SRA
  // result agreeing with the CC encoding of whatever instruction fed the
  // IPM -- confirm this holds for CLC.
  MachineBasicBlock::iterator Cur = InsertPM, Stop = Compare;
  ++Cur;
  while (Cur != Stop) {
    MachineInstr *Between = Cur;
    if (Between != ShiftRight &&
        Between->modifiesRegister(SystemZ::CC, TRI))
      return false;
    ++Cur;
  }

  // Everything matched: drop the whole sequence and the compare.
  InsertPM->eraseFromParent();
  ShiftLeft->eraseFromParent();
  ShiftRight->eraseFromParent();
  Compare->eraseFromParent();
  return true;
}
// Try to delete the comparison Compare, which was described by
// analyzeCompare as "SrcReg compared with the constant Value".  The only
// case handled is a comparison against zero by an instruction that is not
// flagged IsLogical, where SrcReg comes from an IPM sequence that
// removeIPM can eliminate.  Returns true if instructions were removed.
bool
SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
                                       unsigned SrcReg, unsigned SrcReg2,
                                       int Mask, int Value,
                                       const MachineRegisterInfo *MRI) const {
  assert(!SrcReg2 && "Only optimizing constant comparisons so far");

  // Only comparisons against zero are candidates.
  if (Value != 0)
    return false;

  // Instructions marked IsLogical are left alone.
  if ((Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0)
    return false;

  return removeIPM(Compare, SrcReg, MRI, TM.getRegisterInfo());
}
// If Opcode is a move that has a conditional variant, return that variant,
// otherwise return 0.
static unsigned getConditionalMove(unsigned Opcode) {

View File

@ -129,6 +129,12 @@ public:
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const LLVM_OVERRIDE;
bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
unsigned &SrcReg2, int &Mask, int &Value) const
LLVM_OVERRIDE;
bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,

View File

@ -1117,7 +1117,7 @@ let Defs = [CC] in {
// Extract CC into bits 29 and 28 of a register.
let Uses = [CC] in
def IPM : InherentRRE<"ipm", 0xB222, GR32, (null_frag)>;
def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
// Read a 32-bit access register into a GR32. As with all GR32 operations,
// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful

View File

@ -58,6 +58,7 @@ def SDT_ZMemMemLength : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i32>]>;
def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
//===----------------------------------------------------------------------===//
// Node definitions
@ -112,7 +113,9 @@ def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayLoad]>;
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
[SDNPInGlue]>;
//===----------------------------------------------------------------------===//
// Pattern fragments

View File

@ -125,3 +125,30 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
}
return SDValue();
}
// Expand memcmp(Src1, Src2, Size) inline when Size is a constant in the
// range [1, 256], using a single CLC to set CC and IPM to turn CC into an
// i32 result.  Returns (result, chain) on success, or a pair of null
// SDValues if the normal libcall should be used instead.
std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                        SDValue Src1, SDValue Src2, SDValue Size,
                        MachinePointerInfo Op1PtrInfo,
                        MachinePointerInfo Op2PtrInfo) const {
  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
    uint64_t Bytes = CSize->getZExtValue();
    if (Bytes >= 1 && Bytes <= 0x100) {
      // A single CLC.  The comparison result is passed to IPM through glue.
      SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
      Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
                          Src1, Src2, Size);
      SDValue Glue = Chain.getValue(1);
      // IPM inserts the CC value into bits 29 and 28, with 0 meaning "equal",
      // 1 meaning "less" (first operand low) and 2 meaning "greater" (first
      // operand high).  Convert them into an integer that is respectively
      // equal to, less than or greater than 0:
      //
      //   SRL 28  leaves just CC in bits 1 and 0       (0, 1 or 2)
      //   ROTL 31 rotates right by one, moving bit 0 into the sign bit
      //           (0 -> 0, 1 -> 0x80000000, 2 -> 1)
      //
      // A left shift by 2 followed by an arithmetic right shift by 30 would
      // instead sign-extend CC directly and give +1 for "less" and -2 for
      // "greater", which is the wrong sign for memcmp.
      //
      // Note that the compare elimination in SystemZInstrInfo recognises
      // the IPM sequence by its shift opcodes and has to know about this
      // form for a compare against zero to be folded into the CLC.
      SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
      SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
                                DAG.getConstant(28, MVT::i32));
      SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
                                 DAG.getConstant(31, MVT::i32));
      return std::make_pair(ROTL, Chain);
    }
  }
  // Variable or out-of-range length: let the caller emit the libcall.
  return std::make_pair(SDValue(), SDValue());
}

View File

@ -38,7 +38,13 @@ public:
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
SDValue Chain, SDValue Dst, SDValue Byte,
SDValue Size, unsigned Align, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const;
MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
virtual std::pair<SDValue, SDValue>
EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src1, SDValue Src2, SDValue Size,
MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
};
}

View File

@ -0,0 +1,134 @@
; Test memcmp using CLC.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
; Zero-length comparisons should be optimized away: the result is the
; constant 0, loaded straight into %r2, with no CLC and no call.
define i32 @f1(i8 *%src1, i8 *%src2) {
; CHECK-LABEL: f1:
; CHECK: lhi %r2, 0
; CHECK: br %r14
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0)
ret i32 %res
}
; Check a case where the result is used as an integer.  The CC value set
; by CLC is read with IPM and then sign-extended out of bits 29 and 28 by
; the SLL/SRA pair.
; NOTE(review): this test only pins the instruction sequence, not the sign
; of the returned value -- confirm that "first operand low" really yields
; a negative result with these shifts.
define i32 @f2(i8 *%src1, i8 *%src2) {
; CHECK-LABEL: f2:
; CHECK: clc 0(2,%r2), 0(%r3)
; CHECK: ipm %r2
; CHECK: sll %r2, 2
; CHECK: sra %r2, 30
; CHECK: br %r14
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2)
ret i32 %res
}
; Check a case where the result is tested for equality.  The result has no
; other use, so the branch is expected directly after the CLC, testing the
; CC that CLC set, with no IPM, shifts or compare in between.
define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f3:
; CHECK: clc 0(3,%r2), 0(%r3)
; CHECK-NEXT: je {{\..*}}
; CHECK: br %r14
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
%cmp = icmp eq i32 %res, 0
br i1 %cmp, label %exit, label %store
store:
store i32 0, i32 *%dest
br label %exit
exit:
ret void
}
; Check a case where the result is tested for inequality.  As with the
; equality case, the branch (on "low or high") must immediately follow the
; CLC.
define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f4:
; CHECK: clc 0(4,%r2), 0(%r3)
; CHECK-NEXT: jlh {{\..*}}
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
%cmp = icmp ne i32 %res, 0
br i1 %cmp, label %exit, label %store
store:
store i32 0, i32 *%dest
br label %exit
exit:
ret void
}
; Check a case where the result is tested via slt.  A "less than zero"
; test of the memcmp result should become a branch on "low" placed
; immediately after the CLC.
define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f5:
; CHECK: clc 0(5,%r2), 0(%r3)
; CHECK-NEXT: jl {{\..*}}
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
%cmp = icmp slt i32 %res, 0
br i1 %cmp, label %exit, label %store
store:
store i32 0, i32 *%dest
br label %exit
exit:
ret void
}
; Check a case where the result is tested for sgt.  A "greater than zero"
; test of the memcmp result should become a branch on "high" placed
; immediately after the CLC.
define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f6:
; CHECK: clc 0(6,%r2), 0(%r3)
; CHECK-NEXT: jh {{\..*}}
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
%cmp = icmp sgt i32 %res, 0
br i1 %cmp, label %exit, label %store
store:
store i32 0, i32 *%dest
br label %exit
exit:
ret void
}
; Check the upper end of the CLC range (256 bytes).  Here the result is
; used both as an integer and for branching, so the IPM sequence has to
; stay; it's better to branch on the result of the SRA than to compare
; the integer against zero again.  The branch target is matched with the
; same local-label pattern as the other tests in this file.
define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f7:
; CHECK: clc 0(256,%r2), 0(%r3)
; CHECK: ipm %r2
; CHECK: sll %r2, 2
; CHECK: sra %r2, 30
; CHECK: jl {{\..*}}
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
%cmp = icmp slt i32 %res, 0
br i1 %cmp, label %exit, label %store
store:
store i32 0, i32 *%dest
br label %exit
exit:
ret i32 %res
}
; 257 bytes is too big for a single CLC, which covers lengths 1 to 256.
; For now expect a call to the library memcmp instead.
define i32 @f8(i8 *%src1, i8 *%src2) {
; CHECK-LABEL: f8:
; CHECK: brasl %r14, memcmp@PLT
; CHECK: br %r14
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
ret i32 %res
}