[PowerPC] Improve instruction selection bit-permuting operations (32-bit)

The PowerPC backend, somewhat embarrassingly, did not generate an
optimal-length sequence of instructions for a 32-bit bswap. While adding a
pattern for the bswap intrinsic to fix this would not have been terribly
difficult, doing so would not have addressed the real problem: we had been
generating poor code for many bit-permuting operations (by which I mean things
like byte-swap that permute the bits of one or more inputs around in various
ways). Here are some initial steps toward solving this deficiency.

Bit-permuting operations are represented, at the SDAG level, using ISD::ROTL,
SHL, SRL, AND and OR (mostly with constant second operands). Looking back
through these operations, we can build up a description of the bits in the
resulting value in terms of bits of one or more input values (and constant
zeros). For each bit, we compute the rotation amount from the original value,
and then group consecutive (value, rotation factor) bits into groups. Groups
sharing these attributes are then collected and sorted, and we can then
instruction select the entire permutation using a combination of masked
rotations (rlwinm), imm ands (andi/andis), and masked rotation inserts
(rlwimi).

The result is that instead of lowering an i32 bswap as:

	rlwinm 5, 3, 24, 16, 23
	rlwinm 4, 3, 24, 0, 7
	rlwimi 4, 3, 8, 8, 15
	rlwimi 5, 3, 8, 24, 31
	rlwimi 4, 5, 0, 16, 31

we now produce:

	rlwinm 4, 3, 8, 0, 31
	rlwimi 4, 3, 24, 16, 23
	rlwimi 4, 3, 24, 0, 7

and for the 'test6' example in the PowerPC/README.txt file:

 unsigned test6(unsigned x) {
   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 }

we used to produce:

	lis 4, 255
	rlwinm 3, 3, 16, 0, 31
	ori 4, 4, 255
	and 3, 3, 4

and now we produce:

	rlwinm 4, 3, 16, 24, 31
	rlwimi 4, 3, 16, 8, 15

and, as a nice bonus, this fixes the FIXME in
test/CodeGen/PowerPC/rlwimi-and.ll.

This commit does not include instruction-selection for i64 operations, those
will come later.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224318 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Hal Finkel 2014-12-16 05:51:41 +00:00
parent f5cd1efe99
commit edfeeb6d70
5 changed files with 524 additions and 61 deletions

View File

@ -119,6 +119,7 @@ namespace {
SDNode *Select(SDNode *N) override;
SDNode *SelectBitfieldInsert(SDNode *N);
SDNode *SelectBitPermutation(SDNode *N);
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
@ -532,6 +533,481 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
return nullptr;
}
namespace {
class BitPermutationSelector {
struct ValueBit {
SDValue V;
// The bit number in the value, using a convention where bit 0 is the
// lowest-order bit.
unsigned Idx;
enum Kind {
ConstZero,
Variable
} K;
ValueBit(SDValue V, unsigned I, Kind K = Variable)
: V(V), Idx(I), K(K) {}
ValueBit(Kind K = Variable)
: V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
bool isZero() const {
return K == ConstZero;
}
bool hasValue() const {
return K == Variable;
}
SDValue getValue() const {
assert(hasValue() && "Cannot get the value of a constant bit");
return V;
}
unsigned getValueBitIndex() const {
assert(hasValue() && "Cannot get the value bit index of a constant bit");
return Idx;
}
};
// A bit group has the same underlying value and the same rotate factor.
struct BitGroup {
SDValue V;
unsigned RLAmt;
unsigned StartIdx, EndIdx;
BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
: V(V), RLAmt(R), StartIdx(S), EndIdx(E) {
DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
" [" << S << ", " << E << "]\n");
}
};
// Information on each (Value, RLAmt) pair (like the number of groups
// associated with each) used to choose the lowering method.
struct ValueRotInfo {
SDValue V;
unsigned RLAmt;
unsigned NumGroups;
unsigned FirstGroupStartIdx;
ValueRotInfo()
: RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX) {}
// For sorting (in reverse order) by NumGroups, and then by
// FirstGroupStartIdx.
bool operator < (const ValueRotInfo &Other) const {
if (NumGroups > Other.NumGroups)
return true;
else if (NumGroups < Other.NumGroups)
return false;
else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
return true;
return false;
}
};
// Return true if something interesting was deduced, return false if we're
// providing only a generic representation of V (or something else likewise
// uninteresting for instruction selection).
bool getValueBits(SDValue V, SmallVector<ValueBit, 64> &Bits) {
switch (V.getOpcode()) {
default: break;
case ISD::ROTL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned RotAmt = V.getConstantOperandVal(1);
SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);
for (unsigned i = 0; i < Bits.size(); ++i)
Bits[i] = LHSBits[i < RotAmt ? i + (Bits.size() - RotAmt) : i - RotAmt];
return true;
}
break;
case ISD::SHL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);
SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);
for (unsigned i = ShiftAmt; i < Bits.size(); ++i)
Bits[i] = LHSBits[i - ShiftAmt];
for (unsigned i = 0; i < ShiftAmt; ++i)
Bits[i] = ValueBit(ValueBit::ConstZero);
return true;
}
break;
case ISD::SRL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);
SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);
for (unsigned i = 0; i < Bits.size() - ShiftAmt; ++i)
Bits[i] = LHSBits[i + ShiftAmt];
for (unsigned i = Bits.size() - ShiftAmt; i < Bits.size(); ++i)
Bits[i] = ValueBit(ValueBit::ConstZero);
return true;
}
break;
case ISD::AND:
if (isa<ConstantSDNode>(V.getOperand(1))) {
uint64_t Mask = V.getConstantOperandVal(1);
SmallVector<ValueBit, 64> LHSBits(Bits.size());
bool LHSTrivial = getValueBits(V.getOperand(0), LHSBits);
for (unsigned i = 0; i < Bits.size(); ++i)
if (((Mask >> i) & 1) == 1)
Bits[i] = LHSBits[i];
else
Bits[i] = ValueBit(ValueBit::ConstZero);
// Mark this as interesting, only if the LHS was also interesting. This
// prevents the overall procedure from matching a single immediate 'and'
// (which is non-optimal because such an and might be folded with other
// things if we don't select it here).
return LHSTrivial;
}
break;
case ISD::OR: {
SmallVector<ValueBit, 64> LHSBits(Bits.size()), RHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);
getValueBits(V.getOperand(1), RHSBits);
bool AllDisjoint = true;
for (unsigned i = 0; i < Bits.size(); ++i)
if (LHSBits[i].isZero())
Bits[i] = RHSBits[i];
else if (RHSBits[i].isZero())
Bits[i] = LHSBits[i];
else {
AllDisjoint = false;
break;
}
if (!AllDisjoint)
break;
return true;
}
}
for (unsigned i = 0; i < Bits.size(); ++i)
Bits[i] = ValueBit(V, i);
return false;
}
// For each value (except the constant ones), compute the left-rotate amount
// to get it from its original to final position.
void computeRotationAmounts() {
HasZeros = false;
RLAmt.resize(Bits.size());
for (unsigned i = 0; i < Bits.size(); ++i)
if (Bits[i].hasValue()) {
unsigned VBI = Bits[i].getValueBitIndex();
if (i >= VBI)
RLAmt[i] = i - VBI;
else
RLAmt[i] = Bits.size() - (VBI - i);
} else if (Bits[i].isZero()) {
HasZeros = true;
RLAmt[i] = UINT32_MAX;
} else {
llvm_unreachable("Unknown value bit type");
}
}
// Collect groups of consecutive bits with the same underlying value and
// rotation factor.
void collectBitGroups() {
BitGroups.clear();
unsigned LastRLAmt = RLAmt[0];
SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
unsigned LastGroupStartIdx = 0;
for (unsigned i = 1; i < Bits.size(); ++i) {
unsigned ThisRLAmt = RLAmt[i];
SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
// If this bit has the same underlying value and the same rotate factor as
// the last one, then they're part of the same group.
if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
continue;
if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
i-1));
LastRLAmt = ThisRLAmt;
LastValue = ThisValue;
LastGroupStartIdx = i;
}
if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
Bits.size()-1));
if (BitGroups.empty())
return;
// We might be able to combine the first and last groups.
if (BitGroups.size() > 1) {
// If the first and last groups are the same, then remove the first group
// in favor of the last group, making the ending index of the last group
// equal to the ending index of the to-be-removed first group.
if (BitGroups[0].StartIdx == 0 &&
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
}
}
// Take all (SDValue, RLAmt) pairs and sort them by the number of groups
// associated with each. If there is a degeneracy, pick the one that occurs
// first (in the final value).
void collectValueRotInfo() {
ValueRots.clear();
for (auto &BG : BitGroups) {
ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, BG.RLAmt)];
VRI.V = BG.V;
VRI.RLAmt = BG.RLAmt;
VRI.NumGroups += 1;
VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
}
// Now that we've collected the various ValueRotInfo instances, we need to
// sort them.
ValueRotsVec.clear();
for (auto &I : ValueRots) {
ValueRotsVec.push_back(I.second);
}
std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
}
SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
// Depending on the number of groups for a particular value, it might be
// better to rotate, mask explicitly (using andi/andis), and then or the
// result. Select this part of the result first.
void SelectAndParts32(SDNode *N, SDValue &Res) {
SDLoc dl(N);
for (ValueRotInfo &VRI : ValueRotsVec) {
unsigned Mask = 0;
for (unsigned i = 0; i < Bits.size(); ++i) {
if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
continue;
if (RLAmt[i] != VRI.RLAmt)
continue;
Mask |= (1u << i);
}
// Compute the masks for andi/andis that would be necessary.
unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
assert((ANDIMask != 0 || ANDISMask != 0) &&
"No set bits in mask for value bit groups");
bool NeedsRotate = VRI.RLAmt != 0;
// We're trying to minimize the number of instructions. If we have one
// group, using one of andi/andis can break even. If we have three
// groups, we can use both andi and andis and break even (to use both
// andi and andis we also need to or the results together). We need four
// groups if we also need to rotate. To use andi/andis we need to do more
// than break even because rotate-and-mask instructions tend to be easier
// to schedule.
// FIXME: We've biased here against using andi/andis, which is right for
// POWER cores, but not optimal everywhere. For example, on the A2,
// andi/andis have single-cycle latency whereas the rotate-and-mask
// instructions take two cycles, and it would be better to bias toward
// andi/andis in break-even cases.
unsigned NumAndInsts = (unsigned) NeedsRotate +
(unsigned) (ANDIMask != 0) +
(unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0) +
(unsigned) (bool) Res;
if (NumAndInsts >= VRI.NumGroups)
continue;
SDValue VRot;
if (VRI.RLAmt) {
SDValue Ops[] =
{ VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
Ops), 0);
} else {
VRot = VRI.V;
}
SDValue ANDIVal, ANDISVal;
if (ANDIMask != 0)
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
VRot, getI32Imm(ANDIMask)), 0);
if (ANDISMask != 0)
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
VRot, getI32Imm(ANDISMask)), 0);
SDValue TotalVal;
if (!ANDIVal)
TotalVal = ANDISVal;
else if (!ANDISVal)
TotalVal = ANDIVal;
else
TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
ANDIVal, ANDISVal), 0);
if (!Res)
Res = TotalVal;
else
Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
Res, TotalVal), 0);
// Now, remove all groups with this underlying value and rotation
// factor.
for (auto I = BitGroups.begin(); I != BitGroups.end();) {
if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
I = BitGroups.erase(I);
else
++I;
}
}
}
// Instruction selection for the 32-bit case.
SDNode *Select32(SDNode *N) {
SDLoc dl(N);
SDValue Res;
// Take care of cases that should use andi/andis first.
SelectAndParts32(N, Res);
// If we've not yet selected a 'starting' instruction, and we have no zeros
// to fill in, select the (Value, RLAmt) with the highest priority (largest
// number of groups), and start with this rotated value.
if (!HasZeros && !Res) {
ValueRotInfo &VRI = ValueRotsVec[0];
if (VRI.RLAmt) {
SDValue Ops[] =
{ VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
} else {
Res = VRI.V;
}
// Now, remove all groups with this underlying value and rotation factor.
for (auto I = BitGroups.begin(); I != BitGroups.end();) {
if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
I = BitGroups.erase(I);
else
++I;
}
}
// Insert the other groups (one at a time).
for (auto &BG : BitGroups) {
if (!Res.getNode()) {
SDValue Ops[] =
{ BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
getI32Imm(Bits.size() - BG.StartIdx - 1) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
} else {
SDValue Ops[] =
{ Res, BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
getI32Imm(Bits.size() - BG.StartIdx - 1) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0);
}
}
return Res.getNode();
}
SmallVector<ValueBit, 64> Bits;
bool HasZeros;
SmallVector<unsigned, 64> RLAmt;
SmallVector<BitGroup, 16> BitGroups;
DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
SmallVector<ValueRotInfo, 16> ValueRotsVec;
SelectionDAG *CurDAG;
public:
BitPermutationSelector(SelectionDAG *DAG)
: CurDAG(DAG) {}
// Here we try to match complex bit permutations into a set of
// rotate-and-shift/shift/and/or instructions, using a set of heuristics
// known to produce optimial code for common cases (like i32 byte swapping).
SDNode *Select(SDNode *N) {
Bits.resize(N->getValueType(0).getSizeInBits());
if (!getValueBits(SDValue(N, 0), Bits))
return nullptr;
DEBUG(dbgs() << "Considering bit-permutation-based instruction"
" selection for: ");
DEBUG(N->dump(CurDAG));
// Fill it RLAmt and set HasZeros.
computeRotationAmounts();
// Fill in BitGroups.
collectBitGroups();
if (BitGroups.empty())
return nullptr;
// Fill in ValueRotsVec.
collectValueRotInfo();
if (Bits.size() == 32) {
return Select32(N);
} else {
assert(Bits.size() == 64 && "Not 64 bits here?");
// TODO: The 64-bit case!
}
return nullptr;
}
};
} // anonymous namespace
SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
return nullptr;
switch (N->getOpcode()) {
default: break;
case ISD::ROTL:
case ISD::SHL:
case ISD::SRL:
case ISD::AND:
case ISD::OR: {
BitPermutationSelector BPS(CurDAG);
return BPS.Select(N);
}
}
return nullptr;
}
/// SelectCC - Select a comparison of the specified values with the specified
/// condition code, returning the CR# of the expression.
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
@ -947,6 +1423,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
N->getOperand(1).getOpcode() == ISD::TargetConstant)
llvm_unreachable("Invalid ADD with TargetConstant operand");
// Try matching complex bit permutations before doing anything else.
if (SDNode *NN = SelectBitPermutation(N))
return NN;
switch (N->getOpcode()) {
default: break;

View File

@ -533,32 +533,6 @@ _foo:
===-------------------------------------------------------------------------===
We compile:
unsigned test6(unsigned x) {
return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}
into:
_test6:
lis r2, 255
rlwinm r3, r3, 16, 0, 31
ori r2, r2, 255
and r3, r3, r2
blr
GCC gets it down to:
_test6:
rlwinm r0,r3,16,8,15
rlwinm r3,r3,16,24,31
or r3,r3,r0
blr
===-------------------------------------------------------------------------===
Consider a function like this:
float foo(float X) { return X + 1234.4123f; }
@ -655,35 +629,6 @@ _bar:
===-------------------------------------------------------------------------===
We currently compile 32-bit bswap:
declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
%B = call i32 @llvm.bswap.i32(i32 %A)
ret i32 %B
}
to:
_test:
rlwinm r2, r3, 24, 16, 23
slwi r4, r3, 24
rlwimi r2, r3, 8, 24, 31
rlwimi r4, r3, 8, 8, 15
rlwimi r4, r2, 0, 16, 31
mr r3, r4
blr
it would be more efficient to produce:
_foo: mr r0,r3
rlwinm r3,r3,8,0xffffffff
rlwimi r3,r0,24,0,7
rlwimi r3,r0,24,16,23
blr
===-------------------------------------------------------------------------===
test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
__ZNK4llvm5APInt17countLeadingZerosEv:

View File

@ -0,0 +1,40 @@
; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind readnone
define zeroext i32 @bs4(i32 zeroext %a) #0 {
entry:
%0 = tail call i32 @llvm.bswap.i32(i32 %a)
ret i32 %0
; CHECK-LABEL: @bs4
; CHECK: rlwinm [[REG1:[0-9]+]], 3, 8, 0, 31
; CHECK: rlwimi [[REG1]], 3, 24, 16, 23
; CHECK: rlwimi [[REG1]], 3, 24, 0, 7
; CHECK: mr 3, [[REG1]]
; CHECK: blr
}
; Function Attrs: nounwind readnone
define zeroext i32 @test6(i32 zeroext %x) #0 {
entry:
%and = lshr i32 %x, 16
%shr = and i32 %and, 255
%and1 = shl i32 %x, 16
%shl = and i32 %and1, 16711680
%or = or i32 %shr, %shl
ret i32 %or
; CHECK-LABEL: @test6
; CHECK: rlwinm [[REG1:[0-9]+]], 3, 16, 24, 31
; CHECK: rlwimi [[REG1]], 3, 16, 8, 15
; CHECK: mr 3, [[REG1]]
; CHECK: blr
}
; Function Attrs: nounwind readnone
declare i32 @llvm.bswap.i32(i32) #0
attributes #0 = { nounwind readnone }

View File

@ -28,11 +28,9 @@ codeRepl17: ; preds = %codeRepl4
store i16 %rvml38.sroa.0.0.insert.insert, i16* undef, align 2
unreachable
; FIXME: the SLWI could be folded into the RLWIMI to give a rotate of 8.
; CHECK: @test
; CHECK-DAG: slwi [[R1:[0-9]+]], {{[0-9]+}}, 31
; CHECK-DAG: rlwinm [[R2:[0-9]+]], {{[0-9]+}}, 0, 31, 31
; CHECK: rlwimi [[R2]], [[R1]], 9, 23, 23
; CHECK: rlwinm [[R1:[0-9]+]], {{[0-9]+}}, 0, 31, 31
; CHECK: rlwimi [[R1]], {{[0-9]+}}, 8, 23, 23
codeRepl29: ; preds = %codeRepl1
unreachable

View File

@ -1,7 +1,7 @@
; All of these ands and shifts should be folded into rlwimi's
; RUN: llc < %s -march=ppc32 -o %t
; RUN: grep rlwimi %t | count 3
; RUN: grep srwi %t | count 1
; RUN: grep rlwimi %t | count 4
; RUN: not grep srwi %t
; RUN: not grep slwi %t
define i16 @test1(i32 %srcA, i32 %srcB, i32 %alpha) nounwind {