[SystemZ] Use POPCNT instruction on z196

We already exploit a number of instructions specific to z196,
but not yet POPCNT.  Add support for the population-count
facility, MC support for the POPCNT instruction, CodeGen
support for using POPCNT, and implement the getPopcntSupport
TargetTransformInfo hook.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233689 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Ulrich Weigand 2015-03-31 12:56:33 +00:00
parent 64aa9d8b4c
commit ee84973420
13 changed files with 208 additions and 6 deletions

View File

@ -163,8 +163,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
// available, or if the operand is constant.
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
// Use POPCNT on z196 and above.
if (Subtarget.hasPopulationCount())
setOperationAction(ISD::CTPOP, VT, Custom);
else
setOperationAction(ISD::CTPOP, VT, Expand);
// No special instructions for these.
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
@ -2304,6 +2309,45 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
MVT::i64, HighOp, Low32);
}
// Lower a scalar ISD::CTPOP node by means of the POPCNT instruction.
//
// POPCNT only produces a separate bit count for each byte of its operand,
// so the per-byte counts are summed with a sequence of shifts and adds and
// the total is then extracted from the highest byte that took part in the
// summation.  High parts of the operand that are known to be zero are
// skipped so that fewer summation steps are needed.
SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
                                          SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  int64_t OrigBitSize = VT.getSizeInBits();
  SDLoc DL(Op);

  // Get the known-zero mask for the operand.
  Op = Op.getOperand(0);
  APInt KnownZero, KnownOne;
  DAG.computeKnownBits(Op, KnownZero, KnownOne);
  uint64_t Mask = ~KnownZero.getZExtValue();

  // Skip known-zero high parts of the operand.  Never narrow below a single
  // byte: POPCNT already yields the final count for one byte, so going
  // further cannot save any instruction, and without this bound the loop
  // would never terminate for an operand in which at most bit 0 can be set
  // (BitSize would reach 0 and the tested mask would always be empty).
  int64_t BitSize = OrigBitSize;
  while (BitSize > 8 &&
         (Mask & ((((uint64_t)1 << (BitSize / 2)) - 1) << (BitSize / 2))) == 0)
    BitSize = BitSize / 2;

  // The POPCNT instruction counts the number of bits in each byte.
  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);

  // Add up per-byte counts in a binary tree.  All bits of Op at
  // position larger than BitSize remain zero throughout.
  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT));
    // When only the low BitSize bits are in use, mask off whatever the
    // shift moved beyond them so that the higher bits stay zero.
    if (BitSize != OrigBitSize)
      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT));
    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
  }

  // Extract overall result from high byte.
  if (BitSize > 8)
    Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT));

  return Op;
}
// Op is an atomic load. Lower it into a normal volatile load.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
SelectionDAG &DAG) const {
@ -2554,6 +2598,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerUDIVREM(Op, DAG);
case ISD::OR:
return lowerOR(Op, DAG);
case ISD::CTPOP:
return lowerCTPOP(Op, DAG);
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:

View File

@ -87,6 +87,9 @@ enum {
// the number of the register.
EXTRACT_ACCESS,
// Count number of bits set in operand 0 per byte.
POPCNT,
// Wrappers around the ISD opcodes of the same name. The output and
// first input operands are GR128s. The trailing numbers are the
// widths of the second operand in bits.
@ -304,6 +307,7 @@ private:
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,

View File

@ -1382,6 +1382,13 @@ let Defs = [CC] in {
def : Pat<(ctlz GR64:$src),
(EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
// Population count. Counts bits set per byte.
// The definition is only enabled when FeaturePopulationCount is present,
// and it is declared as defining CC.  It selects the z_popcnt node with
// 64-bit register source and destination operands.
let Predicates = [FeaturePopulationCount], Defs = [CC] in {
def POPCNT : InstRRE<0xB9E1, (outs GR64:$R1), (ins GR64:$R2),
"popcnt\t$R1, $R2",
[(set GR64:$R1, (z_popcnt GR64:$R2))]>;
}
// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
def : Pat<(i64 (anyext GR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;

View File

@ -121,6 +121,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
def z_extract_access : SDNode<"SystemZISD::EXTRACT_ACCESS",
SDT_ZExtractAccess>;
def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;

View File

@ -39,6 +39,11 @@ def FeatureFPExtension : SystemZFeature<
"Assume that the floating-point extension facility is installed"
>;
def FeaturePopulationCount : SystemZFeature<
"population-count", "PopulationCount",
"Assume that the population-count facility is installed"
>;
def FeatureFastSerialization : SystemZFeature<
"fast-serialization", "FastSerialization",
"Assume that the fast-serialization facility is installed"
@ -54,9 +59,9 @@ def : Processor<"generic", NoItineraries, []>;
def : Processor<"z10", NoItineraries, []>;
def : Processor<"z196", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
FeatureFPExtension, FeatureFastSerialization,
FeatureInterlockedAccess1]>;
FeatureFPExtension, FeaturePopulationCount,
FeatureFastSerialization, FeatureInterlockedAccess1]>;
def : Processor<"zEC12", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
FeatureFPExtension, FeatureFastSerialization,
FeatureInterlockedAccess1]>;
FeatureFPExtension, FeaturePopulationCount,
FeatureFastSerialization, FeatureInterlockedAccess1]>;

View File

@ -38,7 +38,8 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
HasFastSerialization(false), HasInterlockedAccess1(false),
HasPopulationCount(false), HasFastSerialization(false),
HasInterlockedAccess1(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() {}

View File

@ -38,6 +38,7 @@ protected:
bool HasLoadStoreOnCond;
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
bool HasFastSerialization;
bool HasInterlockedAccess1;
@ -86,6 +87,9 @@ public:
// Return true if the target has the floating-point extension facility.
bool hasFPExtension() const { return HasFPExtension; }
// Return true if the target has the population-count facility.
bool hasPopulationCount() const { return HasPopulationCount; }
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }

View File

@ -229,3 +229,12 @@ unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
}
return SystemZTTIImpl::getIntImmCost(Imm, Ty);
}
// Report how population count is supported for an integer of TyWidth bits:
// fast hardware support when the subtarget has the population-count
// facility and the width is at most 64 bits, software otherwise.
TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");

  const bool HasFastPopcnt = ST->hasPopulationCount() && TyWidth <= 64;
  return HasFastPopcnt ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

View File

@ -60,6 +60,8 @@ public:
unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
};

View File

@ -0,0 +1,96 @@
; Test population-count instruction
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
declare i32 @llvm.ctpop.i32(i32 %a)
declare i64 @llvm.ctpop.i64(i64 %a)
; Test ctpop of a full i32 value: the per-byte counts produced by popcnt are
; summed with two shift-and-add steps and the total is taken from the top byte.
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
; CHECK: popcnt %r0, %r2
; CHECK: sllk %r1, %r0, 16
; CHECK: ar %r1, %r0
; CHECK: sllk %r2, %r1, 8
; CHECK: ar %r2, %r1
; CHECK: srl %r2, 24
; CHECK: br %r14
%popcnt = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %popcnt
}
; Test ctpop of an i32 value masked to its low 16 bits: only one
; shift-and-add step is expected, followed by a shift right by 8.
define i32 @f2(i32 %a) {
; CHECK-LABEL: f2:
; CHECK: llhr %r0, %r2
; CHECK: popcnt %r0, %r0
; CHECK: risblg %r2, %r0, 16, 151, 8
; CHECK: ar %r2, %r0
; CHECK: srl %r2, 8
; CHECK: br %r14
%and = and i32 %a, 65535
%popcnt = call i32 @llvm.ctpop.i32(i32 %and)
ret i32 %popcnt
}
; Test ctpop of an i32 value masked to its low 8 bits: the popcnt result is
; expected to be used directly, with no summation or final shift.
define i32 @f3(i32 %a) {
; CHECK-LABEL: f3:
; CHECK: llcr %r0, %r2
; CHECK: popcnt %r2, %r0
; CHECK: br %r14
%and = and i32 %a, 255
%popcnt = call i32 @llvm.ctpop.i32(i32 %and)
ret i32 %popcnt
}
; Test ctpop of a full i64 value: the per-byte counts produced by popcnt are
; summed with three shift-and-add steps and the total is taken from the top
; byte.
define i64 @f4(i64 %a) {
; CHECK-LABEL: f4:
; CHECK: popcnt %r0, %r2
; CHECK: sllg %r1, %r0, 32
; CHECK: agr %r1, %r0
; CHECK: sllg %r0, %r1, 16
; CHECK: agr %r0, %r1
; CHECK: sllg %r1, %r0, 8
; CHECK: agr %r1, %r0
; CHECK: srlg %r2, %r1, 56
; CHECK: br %r14
%popcnt = call i64 @llvm.ctpop.i64(i64 %a)
ret i64 %popcnt
}
; Test ctpop of an i64 value masked to its low 32 bits: two shift-and-add
; steps are expected, followed by a shift right by 24.
define i64 @f5(i64 %a) {
; CHECK-LABEL: f5:
; CHECK: llgfr %r0, %r2
; CHECK: popcnt %r0, %r0
; CHECK: sllg %r1, %r0, 16
; CHECK: algfr %r0, %r1
; CHECK: sllg %r1, %r0, 8
; CHECK: algfr %r0, %r1
; CHECK: srlg %r2, %r0, 24
; CHECK: br %r14
%and = and i64 %a, 4294967295
%popcnt = call i64 @llvm.ctpop.i64(i64 %and)
ret i64 %popcnt
}
; Test ctpop of an i64 value masked to its low 16 bits: only one
; shift-and-add step is expected, followed by a shift right by 8.
define i64 @f6(i64 %a) {
; CHECK-LABEL: f6:
; CHECK: llghr %r0, %r2
; CHECK: popcnt %r0, %r0
; CHECK: risbg %r1, %r0, 48, 183, 8
; CHECK: agr %r1, %r0
; CHECK: srlg %r2, %r1, 8
; CHECK: br %r14
%and = and i64 %a, 65535
%popcnt = call i64 @llvm.ctpop.i64(i64 %and)
ret i64 %popcnt
}
; Test ctpop of an i64 value masked to its low 8 bits: the popcnt result is
; expected to be used directly, with no summation or final shift.
define i64 @f7(i64 %a) {
; CHECK-LABEL: f7:
; CHECK: llgcr %r0, %r2
; CHECK: popcnt %r2, %r0
; CHECK: br %r14
%and = and i64 %a, 255
%popcnt = call i64 @llvm.ctpop.i64(i64 %and)
ret i64 %popcnt
}

View File

@ -6334,6 +6334,18 @@
# CHECK: pfd 15, 0
0xe3 0xf0 0x00 0x00 0x00 0x36
# CHECK: popcnt %r0, %r0
0xb9 0xe1 0x00 0x00
# CHECK: popcnt %r0, %r15
0xb9 0xe1 0x00 0x0f
# CHECK: popcnt %r15, %r0
0xb9 0xe1 0x00 0xf0
# CHECK: popcnt %r7, %r8
0xb9 0xe1 0x00 0x78
# CHECK: risbg %r0, %r0, 0, 0, 0
0xec 0x00 0x00 0x00 0x00 0x55

View File

@ -2666,6 +2666,11 @@
pfdrl 1, 1
pfdrl 1, 0x100000000
#CHECK: error: {{(instruction requires: population-count)?}}
#CHECK: popcnt %r0, %r0
popcnt %r0, %r0
#CHECK: error: invalid operand
#CHECK: risbg %r0,%r0,0,0,-1
#CHECK: error: invalid operand

View File

@ -1021,6 +1021,16 @@
ork %r15,%r0,%r0
ork %r7,%r8,%r9
#CHECK: popcnt %r0, %r0 # encoding: [0xb9,0xe1,0x00,0x00]
#CHECK: popcnt %r0, %r15 # encoding: [0xb9,0xe1,0x00,0x0f]
#CHECK: popcnt %r15, %r0 # encoding: [0xb9,0xe1,0x00,0xf0]
#CHECK: popcnt %r7, %r8 # encoding: [0xb9,0xe1,0x00,0x78]
popcnt %r0,%r0
popcnt %r0,%r15
popcnt %r15,%r0
popcnt %r7,%r8
#CHECK: risbhg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x5d]
#CHECK: risbhg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x5d]
#CHECK: risbhg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x5d]