From 5423856e44a7e4b173af211b0fb0675c44945a58 Mon Sep 17 00:00:00 2001
From: Jim Grosbach
Date: Sat, 17 Jul 2010 03:30:54 +0000
Subject: [PATCH] Add combiner patterns to more effectively utilize the BFI
 (bitfield insert) instruction for non-constant operands. This includes the
 case referenced in the README.txt regarding a bitfield copy.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@108608 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 84 ++++++++++++++++++++++++------
 lib/Target/ARM/README.txt          | 21 --------
 test/CodeGen/ARM/bfi.ll            | 23 ++++++++
 test/CodeGen/Thumb2/bfi.ll         | 23 ++++++++
 4 files changed, 114 insertions(+), 37 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2e5ba3567e4..2cba4bf0cad 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4240,21 +4240,33 @@ static SDValue PerformMULCombine(SDNode *N,
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+
   // BFI is only available on V6T2+
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
-  // or (and A, mask), val => ARMbfi A, val, mask
-  //   iff (val & mask) == val
-  if (N0->getOpcode() != ISD::AND)
+  DebugLoc DL = N->getDebugLoc();
+  // 1) or (and A, mask), val => ARMbfi A, val, mask
+  //      iff (val & mask) == val
+  //
+  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //  (i.e., copy a bitfield value into another bitfield of the same width)
+  if (N0.getOpcode() != ISD::AND)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32)
     return SDValue();
 
+
   // The value and the mask need to be constants so we can verify this is
   // actually a bitfield set. If the mask is 0xffff, we can do better
   // via a movt instruction, so don't use BFI in that case.
@@ -4264,21 +4276,61 @@ static SDValue PerformORCombine(SDNode *N,
   unsigned Mask = C->getZExtValue();
   if (Mask == 0xffff)
     return SDValue();
-  C = dyn_cast<ConstantSDNode>(N1);
-  if (!C)
-    return SDValue();
-  unsigned Val = C->getZExtValue();
-  if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
-    return SDValue();
-  Val >>= CountTrailingZeros_32(~Mask);
+  SDValue Res;
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  if ((C = dyn_cast<ConstantSDNode>(N1))) {
+    unsigned Val = C->getZExtValue();
+    if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
+      return SDValue();
+    Val >>= CountTrailingZeros_32(~Mask);
 
-  DebugLoc DL = N->getDebugLoc();
-  SDValue Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
-                            DAG.getConstant(Val, MVT::i32),
-                            DAG.getConstant(Mask, MVT::i32));
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
+                      DAG.getConstant(Val, MVT::i32),
+                      DAG.getConstant(Mask, MVT::i32));
 
-  // Do not add new nodes to DAG combiner worklist.
-  DCI.CombineTo(N, Res, false);
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  } else if (N1.getOpcode() == ISD::AND) {
+    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!C)
+      return SDValue();
+    unsigned Mask2 = C->getZExtValue();
+
+    if (ARM::isBitFieldInvertedMask(Mask) &&
+        ARM::isBitFieldInvertedMask(~Mask2) &&
+        (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask == 0xffff || Mask == 0xffff0000))
+        return SDValue();
+      // 2a
+      unsigned lsb = CountTrailingZeros_32(Mask2);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res,
+                        DAG.getConstant(Mask, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+               ARM::isBitFieldInvertedMask(Mask2) &&
+               (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return SDValue();
+      // 2b
+      unsigned lsb = CountTrailingZeros_32(Mask);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+                        DAG.getConstant(Mask2, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+    }
+  }
   return SDValue();
 }
 
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 0cb8ff01181..ba4e5da5a05 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -609,27 +609,6 @@ We currently generate:
 We should be able to replace the second ldr+and with a bic (i.e. reuse the
 constant which was already loaded). Not sure what's necessary to do that.
 
-//===---------------------------------------------------------------------===//
-
-Given the following on ARMv7:
-int test1(int A, int B) {
-  return (A&-8388481)|(B&8388480);
-}
-
-We currently generate:
-  bfc  r0, #7, #16
-  movw r2, #:lower16:8388480
-  movt r2, #:upper16:8388480
-  and  r1, r1, r2
-  orr  r0, r1, r0
-  bx   lr
-
-The following is much shorter:
-  lsr  r1, r1, #7
-  bfi  r0, r1, #7, #16
-  bx   lr
-
-
 //===---------------------------------------------------------------------===//
 
 The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 48ef43762f0..59e2b43a917 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -15,3 +15,26 @@ entry:
   store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
   ret void
 }
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: mov r1, r1, lsr #7
+; CHECK: bfi r0, r1, #7, #16
+  %and = and i32 %A, -8388481          ; <i32> [#uses=1]
+  %and2 = and i32 %B, 8388480          ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: mov r2, r0, lsr #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+  %and = and i32 %A, 8388480           ; <i32> [#uses=1]
+  %and2 = and i32 %B, -8388481         ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index a256d67800e..22473bb35a0 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -15,3 +15,26 @@ entry:
   store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
   ret void
 }
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: lsrs r1, r1, #7
+; CHECK: bfi r0, r1, #7, #16
+  %and = and i32 %A, -8388481          ; <i32> [#uses=1]
+  %and2 = and i32 %B, 8388480          ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: lsrs r2, r0, #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+  %and = and i32 %A, 8388480           ; <i32> [#uses=1]
+  %and2 = and i32 %B, -8388481         ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
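
Note on the case (2) rewrite: it relies on the identity that, for a contiguous
field, (A & ~field) | (B & field) equals "shift B right by lsb, then bfi the
result into A at #lsb, #width". The standalone C++ sketch below checks that
identity for the same field the f2/f3 tests use (8388480 = 0x007fff80, a
16-bit field at bit 7). It is not part of the patch, and the helper names
orOfAnds/lsrThenBfi are illustrative only.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Reference form: keep A's bits outside the field, take B's bits inside it.
static uint32_t orOfAnds(uint32_t A, uint32_t B, uint32_t field) {
  return (A & ~field) | (B & field);
}

// What the emitted "lsr rB, #lsb" + "bfi rA, rB, #lsb, #width" pair computes:
// shift B down by lsb, then insert the low 'width' bits of the result into A
// starting at bit lsb.
static uint32_t lsrThenBfi(uint32_t A, uint32_t B, unsigned lsb,
                           unsigned width) {
  uint32_t field = (0xffffffffu >> (32u - width)) << lsb; // contiguous mask
  uint32_t ins = ((B >> lsb) << lsb) & field;  // low bits of (B >> lsb), re-placed
  return (A & ~field) | ins;
}

int main() {
  const uint32_t field = 0x007fff80u; // == 8388480, bits [7,22]
  for (uint32_t A : {0u, 0xdeadbeefu, 0xffffffffu})
    for (uint32_t B : {0u, 0x12345678u, 0xffffffffu})
      assert(orOfAnds(A, B, field) == lsrThenBfi(A, B, 7, 16));
  return 0;
}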