AArch64/ARM64: implement BFI optimisation

ARM64 was not producing pure BFI instructions for bitfield insertion
operations, unlike AArch64. The approach had to be a little different (in
ISelDAGToDAG rather than ISelLowering), and the outcomes aren't identical but
hopefully this gives it similar power.

This should address PR19424.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207102 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Tim Northover 2014-04-24 12:11:53 +00:00
parent fe6f4e4d31
commit b62ba5eca0
3 changed files with 159 additions and 60 deletions

View File

@ -1394,28 +1394,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
}
// Is mask a i32 or i64 binary sequence 1..10..0 and
// CountTrailingZeros(mask) == ExpectedTrailingZeros
static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
unsigned NumberOfIgnoredHighBits, EVT VT) {
/// Does DstMask form a complementary pair with the mask provided by
/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
/// this asks whether DstMask zeroes precisely those bits that will be set by
/// the other half.
static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
unsigned NumberOfIgnoredHighBits, EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) &&
"i32 or i64 mask type expected!");
unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
APInt SignificantBits =
~APInt::getHighBitsSet(BitWidth, NumberOfIgnoredHighBits);
uint64_t ExpectedMask;
if (VT == MVT::i32) {
uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
ExpectedMask = ExpectedMaski32;
if (NumberOfIgnoredHighBits) {
uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
Mask |= highMask;
}
} else {
ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
if (NumberOfIgnoredHighBits)
Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
}
APInt SignificantDstMask = APInt(BitWidth, DstMask);
APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
return Mask == ExpectedMask;
return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
(SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
}
// Look for bits that will be useful for later uses.
@ -1593,6 +1588,79 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
UsefulBits &= UsersUsefulBits;
}
/// Create a machine node performing a notional SHL of Op by ShlAmount. If
/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
/// 0, return Op unchanged.
static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
if (ShlAmount == 0)
return Op;
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri;
SDNode *ShiftNode;
if (ShlAmount > 0) {
// LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, SDLoc(Op), VT, Op,
CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
} else {
// LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
assert(ShlAmount < 0 && "expected right shift");
int ShrAmount = -ShlAmount;
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
CurDAG->getTargetConstant(BitWidth - 1, VT));
}
return SDValue(ShiftNode, 0);
}
/// Does this tree qualify as an attempt to move a bitfield into position,
/// essentially "(and (shl VAL, N), Mask)".
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
SDValue &Src, int &ShiftAmount,
int &MaskWidth) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(BitWidth == 32 || BitWidth == 64);
APInt KnownZero, KnownOne;
CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value
uint64_t NonZeroBits = (~KnownZero).getZExtValue();
// Discard a constant AND mask if present. It's safe because the node will
// already have been factored into the ComputeMaskedBits calculation above.
uint64_t AndImm;
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
Op = Op.getOperand(0);
}
uint64_t ShlImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
return false;
Op = Op.getOperand(0);
if (!isShiftedMask_64(NonZeroBits))
return false;
ShiftAmount = countTrailingZeros(NonZeroBits);
MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
// amount.
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
return true;
}
// Given a OR operation, check if we have the following pattern
// ubfm c, b, imm, imm2 (or something that does the same jobs, see
// isBitfieldExtractOp)
@ -1602,9 +1670,9 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
// if yes, given reference arguments will be update so that one can replace
// the OR instruction with:
// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
SDValue &Opd1, unsigned &LSB,
unsigned &MSB, SelectionDAG *CurDAG) {
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
SDValue &Src, unsigned &ImmR,
unsigned &ImmS, SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
// Set Opc
@ -1632,30 +1700,36 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
for (int i = 0; i < 2;
++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
unsigned BFXOpc;
// Set Opd1, LSB and MSB arguments by looking for
// c = ubfm b, imm, imm2
if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
NumberOfIgnoredLowBits, true))
continue;
int DstLSB, Width;
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
NumberOfIgnoredLowBits, true)) {
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
(BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
continue;
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
(BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
continue;
// Compute the width of the bitfield insertion
DstLSB = 0;
Width = ImmS - ImmR + 1;
// FIXME: This constraint is to catch bitfield insertion we may
// want to widen the pattern if we want to grab general bitfied
// move case
if (Width <= 0)
continue;
// Compute the width of the bitfield insertion
int sMSB = MSB - LSB + 1;
// FIXME: This constraints is to catch bitfield insertion we may
// want to widen the pattern if we want to grab general bitfied
// move case
if (sMSB <= 0)
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
} else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
DstLSB, Width)) {
ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
ImmS = Width - 1;
} else
continue;
// Check the second part of the pattern
EVT VT = OrOpd1->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
continue;
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
// Compute the Known Zero for the candidate of the first operand.
// This allows to catch more general case than just looking for
@ -1666,19 +1740,22 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// Check if there is enough room for the second operand to appear
// in the first one
if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
APInt BitsToBeInserted =
APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
if ((BitsToBeInserted & ~KnownZero) != 0)
continue;
// Set the first operand
uint64_t Imm;
if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
// In that case, we can eliminate the AND
Opd0 = OrOpd1->getOperand(0);
Dst = OrOpd1->getOperand(0);
else
// Maybe the AND has been removed by simplify-demanded-bits
// or is useful because it discards more bits
Opd0 = OrOpd1Val;
Dst = OrOpd1Val;
// both parts match
return true;

View File

@ -1,4 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64
; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
; First, a simple example from Clang. The registers could plausibly be
; different, but probably won't be.
@ -7,8 +8,10 @@
define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone {
; CHECK-LABEL: from_clang:
; CHECK: bfi w0, w1, #3, #4
; CHECK-NEXT: ret
; CHECK-AARCH64: bfi w0, w1, #3, #4
; CHECK-ARCH64-NEXT: ret
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
entry:
%f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0
@ -26,7 +29,9 @@ entry:
define void @test_whole32(i32* %existing, i32* %new) {
; CHECK-LABEL: test_whole32:
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #6, #4
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff
@ -43,7 +48,8 @@ define void @test_whole32(i32* %existing, i32* %new) {
define void @test_whole64(i64* %existing, i64* %new) {
; CHECK-LABEL: test_whole64:
; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
; CHECK-AARCH64: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #38, #13
; CHECK-NOT: and
; CHECK: ret
@ -62,8 +68,11 @@ define void @test_whole64(i64* %existing, i64* %new) {
define void @test_whole32_from64(i64* %existing, i64* %new) {
; CHECK-LABEL: test_whole32_from64:
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
; CHECK-NOT: and
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
; CHECK-AARCH64-NOT: and
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #0, #15
; CHECK: ret
%oldval = load volatile i64* %existing
@ -80,8 +89,12 @@ define void @test_whole32_from64(i64* %existing, i64* %new) {
define void @test_32bit_masked(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_masked:
; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff
; CHECK-AARCH64: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
; CHECK-AARCH64: and {{w[0-9]+}}, [[INSERT]], #0xff
; CHECK-ARM64: and
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 135 ; = 0x87
@ -98,8 +111,11 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) {
define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_masked:
; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
; CHECK-AARCH64: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
; CHECK-AARCH64: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
; CHECK-ARM64: and
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #24, #7
%oldval = load volatile i64* %existing
%oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
@ -117,8 +133,12 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_complexmask:
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
; CHECK-AARCH64: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
; CHECK-ARM64: and
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 647 ; = 0x287
@ -137,6 +157,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_badmask:
; CHECK-NOT: bfi
; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i32* %existing
@ -156,6 +177,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_badmask:
; CHECK-NOT: bfi
; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i64* %existing
@ -186,8 +208,8 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
%combined = or i32 %oldval_keep, %newval_masked
store volatile i32 %combined, i32* %existing
; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5
; CHECK-AARCH64: bfi {{w[0-9]+}}, [[BIT]], #26, #5
; CHECK-ARM64: bfm {{w[0-9]+}}, [[BIT]], #6, #4
ret void
}

View File

@ -4,7 +4,7 @@
define i32 @f0(i32* nocapture %p) nounwind {
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
; CHECK-STRICT: bfm [[LOW]], [[HIGH]], #16, #15
; CHECK-STRICT: ret
; CHECK: ldr w0, [x0]
@ -15,7 +15,7 @@ define i32 @f0(i32* nocapture %p) nounwind {
define i64 @f1(i64* nocapture %p) nounwind {
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
; CHECK-STRICT: bfm x[[LOW]], x[[HIGH]], #32, #31
; CHECK-STRICT: ret
; CHECK: ldr x0, [x0]