From d40d03e1bd1d51857fc2f9f9230e334c3a32b249 Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Wed, 6 Jan 2010 19:38:29 +0000
Subject: [PATCH] Teach dag combine to fold the following transformation more
 aggressively: (OP (trunc x), (trunc y)) -> (trunc (OP x, y))

Unfortunately this simple change causes dag combine to infinite looping. The problem is the shrink demanded ops optimization tend to canonicalize expressions in the opposite manner. That is badness. This patch disable those optimizations in dag combine but instead it is done as a late pass in sdisel.

This also exposes some deficiencies in dag combine and x86 setcc / brcond lowering. Teach them to look pass ISD::TRUNCATE in various places.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@92849 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SelectionDAGISel.h       |   1 +
 include/llvm/Target/TargetLowering.h          |   6 +-
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 170 ++++++++++++------
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   2 +
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  73 +++++++-
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  22 ++-
 lib/Target/X86/X86ISelLowering.cpp            | 134 +++++++++-----
 .../Blackfin/2009-08-15-SetCC-Undef.ll        |   1 -
 test/CodeGen/Blackfin/promote-logic.ll        |   1 +
 test/CodeGen/CellSPU/mul_ops.ll               |   1 -
 test/CodeGen/SystemZ/2009-06-02-Rotate.ll     |   4 +-
 test/CodeGen/X86/xor-icmp.ll                  |  36 ++++
 12 files changed, 335 insertions(+), 116 deletions(-)
 create mode 100644 test/CodeGen/X86/xor-icmp.ll

diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index bf557268474..b33b21da42a 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -131,6 +131,7 @@ private:
   void CodeGenAndEmitDAG();
   void LowerArguments(BasicBlock *BB);
   
+  void ShrinkDemandedOps();
   void ComputeLiveOutVRegInfo();
 
   void HandlePHINodesInSuccessorBlocks(BasicBlock *LLVMBB);
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index dd28a87938f..15da8456f17 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -774,10 +774,12 @@ public:
   /// that want to combine 
   struct TargetLoweringOpt {
     SelectionDAG &DAG;
+    bool ShrinkOps;
     SDValue Old;
     SDValue New;
 
-    explicit TargetLoweringOpt(SelectionDAG &InDAG) : DAG(InDAG) {}
+    explicit TargetLoweringOpt(SelectionDAG &InDAG, bool Shrink = false) :
+      DAG(InDAG), ShrinkOps(Shrink) {}
     
     bool CombineTo(SDValue O, SDValue N) { 
       Old = O; 
@@ -1478,7 +1480,7 @@ public:
   }
 
   /// isZExtFree - Return true if any actual instruction that defines a
-  /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+  /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
   /// register. This does not necessarily include registers defined in
   /// unknown ways, such as incoming arguments, or copies from unknown
   /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 759fa0e1125..5ab92805f1a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1688,18 +1688,18 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
-  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y))
   //
   // do not sink logical op inside of a vector extend, since it may combine
   // into a vsetcc.
-  if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND||
+  EVT Op0VT = N0.getOperand(0).getValueType();
+  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+       N0.getOpcode() == ISD::ANY_EXTEND  ||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
-       (N0.getOpcode() == ISD::TRUNCATE &&
-        !TLI.isTruncateFree(N0.getOperand(0).getValueType(), VT))) &&
+       (N0.getOpcode() == ISD::TRUNCATE && TLI.isTypeLegal(Op0VT))) &&
       !VT.isVector() &&
-      N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
-      (!LegalOperations ||
-       TLI.isOperationLegal(N->getOpcode(), N0.getOperand(0).getValueType()))) {
+      Op0VT == N1.getOperand(0).getValueType() &&
+      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
     SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
@@ -1839,6 +1839,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
+
   // fold (zext_inreg (extload x)) -> (zextload x)
   if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -1885,48 +1886,89 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
   // fold (and (load x), 255) -> (zextload x, i8)
   // fold (and (extload x, i16), 255) -> (zextload x, i8)
-  if (N1C && N0.getOpcode() == ISD::LOAD) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+  // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
+  if (N1C && (N0.getOpcode() == ISD::LOAD ||
+              (N0.getOpcode() == ISD::ANY_EXTEND &&
+               N0.getOperand(0).getOpcode() == ISD::LOAD))) {
+    bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
+    LoadSDNode *LN0 = HasAnyExt
+      ? cast<LoadSDNode>(N0.getOperand(0))
+      : cast<LoadSDNode>(N0);
     if (LN0->getExtensionType() != ISD::SEXTLOAD &&
-        LN0->isUnindexed() && N0.hasOneUse() &&
-        // Do not change the width of a volatile load.
-        !LN0->isVolatile()) {
-      EVT ExtVT = MVT::Other;
+        LN0->isUnindexed() && N0.hasOneUse()) {
       uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits();
-      if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue()))
-        ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+      if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
+        EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+        EVT LoadedVT = LN0->getMemoryVT();
 
-      EVT LoadedVT = LN0->getMemoryVT();
+        if (ExtVT == LoadedVT &&
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+          if (HasAnyExt) {
+            SDValue Load = 
+              DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(),
+                             LN0->getValueType(0),
+                             LN0->getChain(), LN0->getBasePtr(),
+                             LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                             ExtVT, LN0->isVolatile(), LN0->getAlignment());
+            AddToWorkList(N);
+            CombineTo(N0.getOperand(0).getNode(), Load, Load.getValue(1));
+            return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+          } else {
+            SDValue Load =
+              DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT,
+                             LN0->getChain(), LN0->getBasePtr(),
+                             LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                             ExtVT, LN0->isVolatile(), LN0->getAlignment());
+            AddToWorkList(N);
+            CombineTo(N0.getNode(), Load, Load.getValue(1));
+            return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+          }
+        } else if (!LN0->isVolatile()) {
+          // Do not change the width of a volatile load.
+          // Do not generate loads of non-round integer types since these can
+          // be expensive (and would be wrong if the type is not byte sized).
+          if (LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
+              (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+            EVT PtrType = LN0->getOperand(1).getValueType();
 
-      // Do not generate loads of non-round integer types since these can
-      // be expensive (and would be wrong if the type is not byte sized).
-      if (ExtVT != MVT::Other && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
-          (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
-        EVT PtrType = N0.getOperand(1).getValueType();
+            // For big endian targets, we need to add an offset to the pointer
+            // to load the correct bytes.  For little endian systems, we merely
+            // need to read fewer bytes from the same pointer.
+            unsigned LVTStoreBytes = LoadedVT.getStoreSize();
+            unsigned EVTStoreBytes = ExtVT.getStoreSize();
+            unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
+            unsigned Alignment = LN0->getAlignment();
+            SDValue NewPtr = LN0->getBasePtr();
 
-        // For big endian targets, we need to add an offset to the pointer to
-        // load the correct bytes.  For little endian systems, we merely need to
-        // read fewer bytes from the same pointer.
-        unsigned LVTStoreBytes = LoadedVT.getStoreSize();
-        unsigned EVTStoreBytes = ExtVT.getStoreSize();
-        unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
-        unsigned Alignment = LN0->getAlignment();
-        SDValue NewPtr = LN0->getBasePtr();
+            if (TLI.isBigEndian()) {
+              NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
+                                   NewPtr, DAG.getConstant(PtrOff, PtrType));
+              Alignment = MinAlign(Alignment, PtrOff);
+            }
 
-        if (TLI.isBigEndian()) {
-          NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
-                               NewPtr, DAG.getConstant(PtrOff, PtrType));
-          Alignment = MinAlign(Alignment, PtrOff);
+            AddToWorkList(NewPtr.getNode());
+            if (HasAnyExt) {
+              SDValue Load =
+                DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(),
+                               LN0->getValueType(0),
+                               LN0->getChain(), NewPtr,
+                               LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                               ExtVT, LN0->isVolatile(), Alignment);
+              AddToWorkList(N);
+              CombineTo(N0.getOperand(0).getNode(), Load, Load.getValue(1));
+              return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+            } else {
+              SDValue Load =
+                DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT,
+                               LN0->getChain(), NewPtr,
+                               LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                               ExtVT, LN0->isVolatile(), Alignment);
+              AddToWorkList(N);
+              CombineTo(N0.getNode(), Load, Load.getValue(1));
+              return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+            }
+          }
         }
-
-        AddToWorkList(NewPtr.getNode());
-        SDValue Load =
-          DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT, LN0->getChain(),
-                         NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset(),
-                         ExtVT, LN0->isVolatile(), Alignment);
-        AddToWorkList(N);
-        CombineTo(N0.getNode(), Load, Load.getValue(1));
-        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     }
   }
@@ -2778,9 +2820,17 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   // However when after the source operand of SRL is optimized into AND, the SRL
   // itself may not be optimized further. Look for it and add the BRCOND into
   // the worklist.
-  if (N->hasOneUse() &&
-      N->use_begin()->getOpcode() == ISD::BRCOND)
-    AddToWorkList(*N->use_begin());
+  if (N->hasOneUse()) {
+    SDNode *Use = *N->use_begin();
+    if (Use->getOpcode() == ISD::BRCOND)
+      AddToWorkList(Use);
+    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
+      // Also look pass the truncate.
+      Use = *Use->use_begin();
+      if (Use->getOpcode() == ISD::BRCOND)
+        AddToWorkList(Use);
+    }
+  }
 
   return SDValue();
 }
@@ -3198,7 +3248,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
 
   // fold (zext (truncate x)) -> (and x, mask)
   if (N0.getOpcode() == ISD::TRUNCATE &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
+      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) &&
+      (!TLI.isTruncateFree(N0.getOperand(0).getValueType(),
+                           N0.getValueType()) ||
+       !TLI.isZExtFree(N0.getValueType(), VT))) {
     SDValue Op = N0.getOperand(0);
     if (Op.getValueType().bitsLT(VT)) {
       Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
@@ -3704,7 +3757,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
     else
       // if the source and dest are the same type, we can drop both the extend
-      // and the truncate
+      // and the truncate.
       return N0.getOperand(0);
   }
 
@@ -4515,6 +4568,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
                        N1.getOperand(0), N1.getOperand(1), N2);
   }
 
+  SDNode *Trunc = 0;
+  if (N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) {
+    // Look pass truncate.
+    Trunc = N1.getNode();
+    N1 = N1.getOperand(0);
+  }
+
   if (N1.hasOneUse() && N1.getOpcode() == ISD::SRL) {
     // Match this pattern so that we can generate simpler code:
     //
@@ -4526,7 +4586,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
     // into
     // 
     //   %a = ...
-    //   %b = and %a, 2
+    //   %b = and i32 %a, 2
     //   %c = setcc eq %b, 0
     //   brcond %c ...
     //
@@ -4537,7 +4597,6 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
     SDValue Op1 = N1.getOperand(1);
 
     if (Op0.getOpcode() == ISD::AND &&
-        Op0.hasOneUse() &&
         Op1.getOpcode() == ISD::Constant) {
       SDValue AndOp1 = Op0.getOperand(1);
 
@@ -4552,12 +4611,21 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
                          Op0, DAG.getConstant(0, Op0.getValueType()),
                          ISD::SETNE);
 
+          SDValue NewBRCond = DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                                          MVT::Other, Chain, SetCC, N2);
+          // Don't add the new BRCond into the worklist or else SimplifySelectCC
+          // will convert it back to (X & C1) >> C2.
+          CombineTo(N, NewBRCond, false);
+          // Truncate is dead.
+          if (Trunc) {
+            removeFromWorkList(Trunc);
+            DAG.DeleteNode(Trunc);
+          }
           // Replace the uses of SRL with SETCC
           DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
           removeFromWorkList(N1.getNode());
           DAG.DeleteNode(N1.getNode());
-          return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
-                             MVT::Other, Chain, SetCC, N2);
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
         }
       }
     }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f1c8650728c..ca8c17beffa 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2656,6 +2656,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     // size of the value, the shift/rotate count is guaranteed to be zero.
     if (VT == MVT::i1)
       return N1;
+    if (N2C && N2C->isNullValue())
+      return N1;
     break;
   case ISD::FP_ROUND_INREG: {
     EVT EVT = cast<VTSDNode>(N2)->getVT();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3073dfe9cc6..8ed24cce744 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -438,6 +438,75 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
   SDB->clear();
 }
 
+void SelectionDAGISel::ShrinkDemandedOps() {
+  SmallVector<SDNode*, 128> Worklist;
+
+  // Add all the dag nodes to the worklist.
+  Worklist.reserve(CurDAG->allnodes_size());
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ++I)
+    Worklist.push_back(I);
+
+  APInt Mask;
+  APInt KnownZero;
+  APInt KnownOne;
+
+  TargetLowering::TargetLoweringOpt TLO(*CurDAG, true);
+  while (!Worklist.empty()) {
+    SDNode *N = Worklist.back();
+    Worklist.pop_back();
+
+    if (N->use_empty() && N != CurDAG->getRoot().getNode()) {
+      CurDAG->DeleteNode(N);
+      continue;
+    }
+
+    // Run ShrinkDemandedOp on scalar binary operations.
+    if (N->getNumValues() == 1 &&
+        N->getValueType(0).isSimple() && N->getValueType(0).isInteger()) {
+      DebugLoc dl = N->getDebugLoc();
+      unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+      APInt Demanded = APInt::getAllOnesValue(BitWidth);
+      APInt KnownZero, KnownOne;
+      if (TLI.SimplifyDemandedBits(SDValue(N, 0), Demanded,
+                                   KnownZero, KnownOne, TLO)) {
+        // Revisit the node.
+        Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), N),
+                       Worklist.end());
+        Worklist.push_back(N);
+
+        // Replace the old value with the new one.
+        DEBUG(errs() << "\nReplacing "; 
+              TLO.Old.getNode()->dump(CurDAG);
+              errs() << "\nWith: ";
+              TLO.New.getNode()->dump(CurDAG);
+              errs() << '\n');
+
+        Worklist.push_back(TLO.New.getNode());
+        CurDAG->ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
+
+        if (TLO.Old.getNode()->use_empty()) {
+          for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands();
+               i != e; ++i) {
+            SDNode *OpNode = TLO.Old.getNode()->getOperand(i).getNode(); 
+            if (OpNode->hasOneUse()) {
+              Worklist.erase(std::remove(Worklist.begin(), Worklist.end(),
+                                         OpNode),
+                             Worklist.end());
+              Worklist.push_back(TLO.Old.getNode()->getOperand(i).getNode());
+            }
+          }
+
+          Worklist.erase(std::remove(Worklist.begin(), Worklist.end(),
+                                     TLO.Old.getNode()),
+                         Worklist.end());
+          CurDAG->DeleteNode(TLO.Old.getNode());
+        }
+      }
+    }
+  }
+}
+
 void SelectionDAGISel::ComputeLiveOutVRegInfo() {
   SmallPtrSet<SDNode*, 128> VisitedNodes;
   SmallVector<SDNode*, 128> Worklist;
@@ -609,8 +678,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
 
-  if (OptLevel != CodeGenOpt::None)
+  if (OptLevel != CodeGenOpt::None) {
+    ShrinkDemandedOps();
     ComputeLiveOutVRegInfo();
+  }
 
   // Third, instruction select all of the operations to machine code, adding the
   // code to the MachineBasicBlock.
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d9a5a13666b..f7694dbd3e5 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -990,7 +990,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
       return true;
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
       return true;
 
     // Output known-1 bits are only known if set in both the LHS & RHS.
@@ -1024,7 +1024,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     if (TLO.ShrinkDemandedConstant(Op, NewMask))
       return true;
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
       return true;
 
     // Output known-0 bits are only known if clear in both the LHS & RHS.
@@ -1049,7 +1049,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     if ((KnownZero2 & NewMask) == NewMask)
       return TLO.CombineTo(Op, Op.getOperand(1));
     // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
       return true;
 
     // If all of the unknown bits are known to be zero on one side or the other
@@ -1480,7 +1480,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                              KnownOne2, TLO, Depth+1))
       return true;
     // See if the operation should be performed at a smaller bit width.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
       return true;
   }
   // FALL THROUGH
@@ -1876,7 +1876,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
 
     // Fold bit comparisons when we can.
     if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
-        VT == N0.getValueType() && N0.getOpcode() == ISD::AND)
+        (VT == N0.getValueType() ||
+         (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
+        N0.getOpcode() == ISD::AND)
       if (ConstantSDNode *AndRHS =
                   dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
         EVT ShiftTy = DCI.isBeforeLegalize() ?
@@ -1884,16 +1886,18 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0  -->  (X & 8) >> 3
           // Perform the xform if the AND RHS is a single bit.
           if (isPowerOf2_64(AndRHS->getZExtValue())) {
-            return DAG.getNode(ISD::SRL, dl, VT, N0,
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                              DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
                                 DAG.getConstant(Log2_64(AndRHS->getZExtValue()),
-                                                ShiftTy));
+                                                ShiftTy)));
           }
         } else if (Cond == ISD::SETEQ && C1 == AndRHS->getZExtValue()) {
           // (X & 8) == 8  -->  (X & 8) >> 3
           // Perform the xform if C1 is a single bit.
           if (C1.isPowerOf2()) {
-            return DAG.getNode(ISD::SRL, dl, VT, N0,
-                                DAG.getConstant(C1.logBase2(), ShiftTy));
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+                                      DAG.getConstant(C1.logBase2(), ShiftTy)));
           }
         }
       }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0367165f5c4..7d92fd8fad6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5681,58 +5681,51 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
 }
 
-static SDValue LowerToBT(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
+/// if it's possible.
+static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
                          DebugLoc dl, SelectionDAG &DAG) {
-  // Lower (X & (1 << N)) == 0 to BT(X, N).
-  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
-  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
-  if (Op0.getOpcode() == ISD::AND &&
-      Op0.hasOneUse() &&
-      Op1.getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
-      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-    SDValue LHS, RHS;
-    if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
-      if (ConstantSDNode *Op010C =
-          dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
-        if (Op010C->getZExtValue() == 1) {
-          LHS = Op0.getOperand(0);
-          RHS = Op0.getOperand(1).getOperand(1);
-        }
-    } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
-      if (ConstantSDNode *Op000C =
-          dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
-        if (Op000C->getZExtValue() == 1) {
-          LHS = Op0.getOperand(1);
-          RHS = Op0.getOperand(0).getOperand(1);
-        }
-    } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
-      ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
-      SDValue AndLHS = Op0.getOperand(0);
-      if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
-        LHS = AndLHS.getOperand(0);
-        RHS = AndLHS.getOperand(1);
+  SDValue LHS, RHS;
+  if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *Op010C =
+        dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
+      if (Op010C->getZExtValue() == 1) {
+        LHS = Op0.getOperand(0);
+        RHS = Op0.getOperand(1).getOperand(1);
       }
+  } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *Op000C =
+        dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
+      if (Op000C->getZExtValue() == 1) {
+        LHS = Op0.getOperand(1);
+        RHS = Op0.getOperand(0).getOperand(1);
+      }
+  } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
+    SDValue AndLHS = Op0.getOperand(0);
+    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+      LHS = AndLHS.getOperand(0);
+      RHS = AndLHS.getOperand(1);
     }
+  }
 
-    if (LHS.getNode()) {
-      // If LHS is i8, promote it to i16 with any_extend.  There is no i8 BT
-      // instruction.  Since the shift amount is in-range-or-undefined, we know
-      // that doing a bittest on the i16 value is ok.  We extend to i32 because
-      // the encoding for the i16 version is larger than the i32 version.
-      if (LHS.getValueType() == MVT::i8)
-        LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+  if (LHS.getNode()) {
+    // If LHS is i8, promote it to i16 with any_extend.  There is no i8 BT
+    // instruction.  Since the shift amount is in-range-or-undefined, we know
+    // that doing a bittest on the i16 value is ok.  We extend to i32 because
+    // the encoding for the i16 version is larger than the i32 version.
+    if (LHS.getValueType() == MVT::i8)
+      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
 
-      // If the operand types disagree, extend the shift amount to match.  Since
-      // BT ignores high bits (like shifts) we can use anyextend.
-      if (LHS.getValueType() != RHS.getValueType())
-        RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+    // If the operand types disagree, extend the shift amount to match.  Since
+    // BT ignores high bits (like shifts) we can use anyextend.
+    if (LHS.getValueType() != RHS.getValueType())
+      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
 
-      SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
-      unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
-      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
-                         DAG.getConstant(Cond, MVT::i8), BT);
-    }
+    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                       DAG.getConstant(Cond, MVT::i8), BT);
   }
 
   return SDValue();
@@ -5746,9 +5739,18 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
   // Optimize to BT if possible.
-  SDValue NewCond = LowerToBT(Op0, Op1, CC, dl, DAG);
-  if (NewCond.getNode())
-    return NewCond;
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+  if (Op0.getOpcode() == ISD::AND &&
+      Op0.hasOneUse() &&
+      Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+    if (NewSetCC.getNode())
+      return NewSetCC;
+  }
 
   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
@@ -5946,6 +5948,23 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
     }
   }
 
+  if (addTest) {
+    // Look pass the truncate.
+    if (Cond.getOpcode() == ISD::TRUNCATE)
+      Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
     Cond = EmitTest(Cond, X86::COND_NE, DAG);
@@ -6103,6 +6122,23 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
     }
   }
 
+  if (addTest) {
+    // Look pass the truncate.
+    if (Cond.getOpcode() == ISD::TRUNCATE)
+      Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
     Cond = EmitTest(Cond, X86::COND_NE, DAG);
diff --git a/test/CodeGen/Blackfin/2009-08-15-SetCC-Undef.ll b/test/CodeGen/Blackfin/2009-08-15-SetCC-Undef.ll
index f21da52315f..b6cd2d40d1a 100644
--- a/test/CodeGen/Blackfin/2009-08-15-SetCC-Undef.ll
+++ b/test/CodeGen/Blackfin/2009-08-15-SetCC-Undef.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=bfin -verify-machineinstrs
-; XFAIL: *
 
 ; An undef argument causes a setugt node to escape instruction selection.
 
diff --git a/test/CodeGen/Blackfin/promote-logic.ll b/test/CodeGen/Blackfin/promote-logic.ll
index c247aca0a5b..46da56681d4 100644
--- a/test/CodeGen/Blackfin/promote-logic.ll
+++ b/test/CodeGen/Blackfin/promote-logic.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=bfin > %t
+; XFAIL: *
 
 ; DAGCombiner::SimplifyBinOpWithSameOpcodeHands can produce an illegal i16 OR
 ; operation after LegalizeOps.
diff --git a/test/CodeGen/CellSPU/mul_ops.ll b/test/CodeGen/CellSPU/mul_ops.ll
index 031d6c37ce7..1e28fc7a918 100644
--- a/test/CodeGen/CellSPU/mul_ops.ll
+++ b/test/CodeGen/CellSPU/mul_ops.ll
@@ -11,7 +11,6 @@
 ; RUN: grep shli    %t1.s | count 4
 ; RUN: grep shlhi   %t1.s | count 4
 ; RUN: grep ila     %t1.s | count 2
-; RUN: grep xsbh    %t1.s | count 4
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
 target triple = "spu"
 
diff --git a/test/CodeGen/SystemZ/2009-06-02-Rotate.ll b/test/CodeGen/SystemZ/2009-06-02-Rotate.ll
index 3317864c014..07a164d4264 100644
--- a/test/CodeGen/SystemZ/2009-06-02-Rotate.ll
+++ b/test/CodeGen/SystemZ/2009-06-02-Rotate.ll
@@ -5,8 +5,8 @@ target triple = "s390x-linux"
 
 define i32 @rotl(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
-	%shl = shl i32 %x, 0		; <i32> [#uses=1]
-	%sub = sub i32 32, 0		; <i32> [#uses=1]
+	%shl = shl i32 %x, 1		; <i32> [#uses=1]
+	%sub = sub i32 32, 1		; <i32> [#uses=1]
 	%shr = lshr i32 %x, %sub		; <i32> [#uses=1]
 	%or = or i32 %shr, %shl		; <i32> [#uses=1]
 	ret i32 %or
diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll
new file mode 100644
index 00000000000..a6bdb13ec6b
--- /dev/null
+++ b/test/CodeGen/X86/xor-icmp.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=x86    | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+
+define i32 @t(i32 %a, i32 %b) nounwind ssp {
+entry:
+; X32:     t:
+; X32:     xorb
+; X32-NOT: andb
+; X32-NOT: shrb
+; X32:     testb $64
+; X32:     jne
+
+; X64:     t:
+; X64-NOT: setne
+; X64:     xorl
+; X64:     testb $64
+; X64:     jne
+  %0 = and i32 %a, 16384
+  %1 = icmp ne i32 %0, 0
+  %2 = and i32 %b, 16384
+  %3 = icmp ne i32 %2, 0
+  %4 = xor i1 %1, %3
+  br i1 %4, label %bb1, label %bb
+
+bb:                                               ; preds = %entry
+  %5 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=1]
+  ret i32 %5
+
+bb1:                                              ; preds = %entry
+  %6 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+  ret i32 %6
+}
+
+declare i32 @foo(...)
+
+declare i32 @bar(...)