From 5423856e44a7e4b173af211b0fb0675c44945a58 Mon Sep 17 00:00:00 2001
From: Jim Grosbach
Date: Sat, 17 Jul 2010 03:30:54 +0000
Subject: [PATCH] Add combiner patterns to more effectively utilize the BFI
 (bitfield insert) instruction for non-constant operands. This includes the
 case referenced in the README.txt regarding a bitfield copy.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@108608 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 84 ++++++++++++++++++++++++------
 lib/Target/ARM/README.txt          | 21 --------
 test/CodeGen/ARM/bfi.ll            | 23 ++++++++
 test/CodeGen/Thumb2/bfi.ll         | 23 ++++++++
 4 files changed, 114 insertions(+), 37 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2e5ba3567e4..2cba4bf0cad 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4240,21 +4240,33 @@ static SDValue PerformMULCombine(SDNode *N,
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+
   // BFI is only available on V6T2+
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
-  // or (and A, mask), val => ARMbfi A, val, mask
-  //   iff (val & mask) == val
-  if (N0->getOpcode() != ISD::AND)
+  DebugLoc DL = N->getDebugLoc();
+  // 1) or (and A, mask), val => ARMbfi A, val, mask
+  //      iff (val & mask) == val
+  //
+  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //  (i.e., copy a bitfield value into another bitfield of the same width)
+  if (N0.getOpcode() != ISD::AND)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32)
     return SDValue();
 
+
   // The value and the mask need to be constants so we can verify this is
   // actually a bitfield set. If the mask is 0xffff, we can do better
   // via a movt instruction, so don't use BFI in that case.
@@ -4264,21 +4276,61 @@ static SDValue PerformORCombine(SDNode *N,
   unsigned Mask = C->getZExtValue();
   if (Mask == 0xffff)
     return SDValue();
-  C = dyn_cast<ConstantSDNode>(N1);
-  if (!C)
-    return SDValue();
-  unsigned Val = C->getZExtValue();
-  if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
-    return SDValue();
-  Val >>= CountTrailingZeros_32(~Mask);
+  SDValue Res;
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  if ((C = dyn_cast<ConstantSDNode>(N1))) {
+    unsigned Val = C->getZExtValue();
+    if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
+      return SDValue();
+    Val >>= CountTrailingZeros_32(~Mask);
 
-  DebugLoc DL = N->getDebugLoc();
-  SDValue Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
-                            DAG.getConstant(Val, MVT::i32),
-                            DAG.getConstant(Mask, MVT::i32));
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
+                      DAG.getConstant(Val, MVT::i32),
+                      DAG.getConstant(Mask, MVT::i32));
 
-  // Do not add new nodes to DAG combiner worklist.
-  DCI.CombineTo(N, Res, false);
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  } else if (N1.getOpcode() == ISD::AND) {
+    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!C)
+      return SDValue();
+    unsigned Mask2 = C->getZExtValue();
+
+    if (ARM::isBitFieldInvertedMask(Mask) &&
+        ARM::isBitFieldInvertedMask(~Mask2) &&
+        (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask == 0xffff || Mask == 0xffff0000))
+        return SDValue();
+      // 2a
+      unsigned lsb = CountTrailingZeros_32(Mask2);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res,
+                        DAG.getConstant(Mask, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+               ARM::isBitFieldInvertedMask(Mask2) &&
+               (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return SDValue();
+      // 2b
+      unsigned lsb = CountTrailingZeros_32(Mask);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+                        DAG.getConstant(Mask2, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+    }
+  }
   return SDValue();
 }
 
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 0cb8ff01181..ba4e5da5a05 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -609,27 +609,6 @@ We currently generate:
 We should be able to replace the second ldr+and with a bic (i.e. reuse the
 constant which was already loaded). Not sure what's necessary to do that.
 
-//===---------------------------------------------------------------------===//
-
-Given the following on ARMv7:
-int test1(int A, int B) {
-  return (A&-8388481)|(B&8388480);
-}
-
-We currently generate:
-  bfc  r0, #7, #16
-  movw r2, #:lower16:8388480
-  movt r2, #:upper16:8388480
-  and  r1, r1, r2
-  orr  r0, r1, r0
-  bx   lr
-
-The following is much shorter:
-  lsr  r1, r1, #7
-  bfi  r0, r1, #7, #16
-  bx   lr
-
-
 //===---------------------------------------------------------------------===//
 
 The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 48ef43762f0..59e2b43a917 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -15,3 +15,26 @@ entry:
   store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
   ret void
 }
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: mov r1, r1, lsr #7
+; CHECK: bfi r0, r1, #7, #16
+  %and = and i32 %A, -8388481          ; <i32> [#uses=1]
+  %and2 = and i32 %B, 8388480          ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: mov r2, r0, lsr #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+  %and = and i32 %A, 8388480           ; <i32> [#uses=1]
+  %and2 = and i32 %B, -8388481         ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index a256d67800e..22473bb35a0 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -15,3 +15,26 @@ entry:
   store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
   ret void
 }
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: lsrs r1, r1, #7
+; CHECK: bfi r0, r1, #7, #16
+  %and = and i32 %A, -8388481          ; <i32> [#uses=1]
+  %and2 = and i32 %B, 8388480          ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: lsrs r2, r0, #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+  %and = and i32 %A, 8388480           ; <i32> [#uses=1]
+  %and2 = and i32 %B, -8388481         ; <i32> [#uses=1]
+  %or = or i32 %and2, %and             ; <i32> [#uses=1]
+  ret i32 %or
+}
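
Note on the case (2) rewrite: it relies on the identity that, for a contiguous
field, (A & ~field) | (B & field) equals "shift B right by lsb, then bfi the
result into A at #lsb, #width". The standalone C++ sketch below checks that
identity for the same field the f2/f3 tests use (8388480 = 0x007fff80, a
16-bit field at bit 7). It is not part of the patch, and the helper names
orOfAnds/lsrThenBfi are illustrative only.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Reference form: keep A's bits outside the field, take B's bits inside it.
static uint32_t orOfAnds(uint32_t A, uint32_t B, uint32_t field) {
  return (A & ~field) | (B & field);
}

// What the emitted "lsr rB, #lsb" + "bfi rA, rB, #lsb, #width" pair computes:
// shift B down by lsb, then insert the low 'width' bits of the result into A
// starting at bit lsb.
static uint32_t lsrThenBfi(uint32_t A, uint32_t B, unsigned lsb,
                           unsigned width) {
  uint32_t field = (0xffffffffu >> (32u - width)) << lsb; // contiguous mask
  uint32_t ins = ((B >> lsb) << lsb) & field;  // low bits of (B >> lsb), re-placed
  return (A & ~field) | ins;
}

int main() {
  const uint32_t field = 0x007fff80u; // == 8388480, bits [7,22]
  for (uint32_t A : {0u, 0xdeadbeefu, 0xffffffffu})
    for (uint32_t B : {0u, 0x12345678u, 0xffffffffu})
      assert(orOfAnds(A, B, field) == lsrThenBfi(A, B, 7, 16));
  return 0;
}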