diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index bc670768989..c12a82c03b8 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -42,6 +42,12 @@ using namespace llvm; cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); +cl::opt UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true), + cl::desc("use aggressive ppc isel for bit permutations"), cl::Hidden); +cl::opt BPermRewriterNoMasking("ppc-bit-perm-rewriter-stress-rotates", + cl::desc("stress rotate selection in aggressive ppc isel for " + "bit permutations"), cl::Hidden); + namespace llvm { void initializePPCDAGToDAGISelPass(PassRegistry&); } @@ -533,6 +539,152 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { return nullptr; } +// Predict the number of instructions that would be generated by calling +// SelectInt64(N). +static unsigned SelectInt64Count(int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros(Imm); + int64_t ImmSh = static_cast(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + unsigned Result = 0; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + ++Result; + } else if (Lo) { + // Handle the Hi bits and Lo bits. + Result += 2; + } else { + // Just the Hi bits. + ++Result; + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) + ++Result; + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) + ++Result; + if ((Lo = Remainder & 0xFFFF)) + ++Result; + + return Result; +} + +// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count +// (above) needs to be kept in sync with this function. +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros(Imm); + int64_t ImmSh = static_cast(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + SDNode *Result; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + auto getI32Imm = [CurDAG](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + }; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + } else if (Lo) { + // Handle the Hi bits. + unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); + // And Lo bits. + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } else { + // Just the Hi bits. 
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) { + Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, + SDValue(Result, 0), + getI32Imm(Shift), + getI32Imm(63 - Shift)); + } + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Hi)); + } + if ((Lo = Remainder & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } + + return Result; +} + +// Select a 64-bit constant. +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) { + SDLoc dl(N); + + // Get 64 bit value. + int64_t Imm = cast(N)->getZExtValue(); + return SelectInt64(CurDAG, dl, Imm); +} + + namespace { class BitPermutationSelector { struct ValueBit { @@ -577,8 +729,19 @@ class BitPermutationSelector { unsigned RLAmt; unsigned StartIdx, EndIdx; + // This rotation amount assumes that the lower 32 bits of the quantity are + // replicated in the high 32 bits by the rotation operator (which is done + // by rlwinm and friends in 64-bit mode). + bool Repl32; + // Did converting to Repl32 == true change the rotation factor? If it did, + // it decreased it by 32. + bool Repl32CR; + // Was this group coalesced after setting Repl32 to true? + bool Repl32Coalesced; + BitGroup(SDValue V, unsigned R, unsigned S, unsigned E) - : V(V), RLAmt(R), StartIdx(S), EndIdx(E) { + : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false), + Repl32Coalesced(false) { DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R << " [" << S << ", " << E << "]\n"); } @@ -591,14 +754,23 @@ class BitPermutationSelector { unsigned RLAmt; unsigned NumGroups; unsigned FirstGroupStartIdx; + bool Repl32; ValueRotInfo() - : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX) {} + : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX), + Repl32(false) {} // For sorting (in reverse order) by NumGroups, and then by // FirstGroupStartIdx. bool operator < (const ValueRotInfo &Other) const { - if (NumGroups > Other.NumGroups) + // We need to sort so that the non-Repl32 come first because, when we're + // doing masking, the Repl32 bit groups might be subsumed into the 64-bit + // masking operation. + if (Repl32 < Other.Repl32) + return true; + else if (Repl32 > Other.Repl32) + return false; + else if (NumGroups > Other.NumGroups) return true; else if (NumGroups < Other.NumGroups) return false; @@ -729,8 +901,9 @@ class BitPermutationSelector { } // Collect groups of consecutive bits with the same underlying value and - // rotation factor. - void collectBitGroups() { + // rotation factor. If we're doing late masking, we ignore zeros, otherwise + // they break up groups. + void collectBitGroups(bool LateMask) { BitGroups.clear(); unsigned LastRLAmt = RLAmt[0]; @@ -739,6 +912,14 @@ class BitPermutationSelector { for (unsigned i = 1; i < Bits.size(); ++i) { unsigned ThisRLAmt = RLAmt[i]; SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue(); + if (LateMask && !ThisValue) { + ThisValue = LastValue; + ThisRLAmt = LastRLAmt; + // If we're doing late masking, then the first bit group always starts + // at zero (even if the first bits were zero). 
+ if (BitGroups.empty()) + LastGroupStartIdx = 0; + } // If this bit has the same underlying value and the same rotate factor as // the last one, then they're part of the same group. @@ -768,6 +949,7 @@ class BitPermutationSelector { BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && BitGroups[0].V == BitGroups[BitGroups.size()-1].V && BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { + DEBUG(dbgs() << "\tcombining final bit group with inital one\n"); BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; BitGroups.erase(BitGroups.begin()); } @@ -781,9 +963,11 @@ class BitPermutationSelector { ValueRots.clear(); for (auto &BG : BitGroups) { - ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, BG.RLAmt)]; + unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0); + ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)]; VRI.V = BG.V; VRI.RLAmt = BG.RLAmt; + VRI.Repl32 = BG.Repl32; VRI.NumGroups += 1; VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx); } @@ -797,15 +981,164 @@ class BitPermutationSelector { std::sort(ValueRotsVec.begin(), ValueRotsVec.end()); } + // In 64-bit mode, rlwinm and friends have a rotation operator that + // replicates the low-order 32 bits into the high-order 32-bits. The mask + // indices of these instructions can only be in the lower 32 bits, so they + // can only represent some 64-bit bit groups. However, when they can be used, + // the 32-bit replication can be used to represent, as a single bit group, + // otherwise separate bit groups. We'll convert to replicated-32-bit bit + // groups when possible. Returns true if any of the bit groups were + // converted. + void assignRepl32BitGroups() { + // If we have bits like this: + // + // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24 + // Groups: | RLAmt = 8 | RLAmt = 40 | + // + // But, making use of a 32-bit operation that replicates the low-order 32 + // bits into the high-order 32 bits, this can be one bit group with a RLAmt + // of 8. + + auto IsAllLow32 = [this](BitGroup & BG) { + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + for (unsigned i = 0; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } + + return true; + }; + + for (auto &BG : BitGroups) { + if (BG.StartIdx < 32 && BG.EndIdx < 32) { + if (IsAllLow32(BG)) { + if (BG.RLAmt >= 32) { + BG.RLAmt -= 32; + BG.Repl32CR = true; + } + + BG.Repl32 = true; + + DEBUG(dbgs() << "\t32-bit replicated bit group for " << + BG.V.getNode() << " RLAmt = " << BG.RLAmt << + " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n"); + } + } + } + + // Now walk through the bit groups, consolidating where possible. + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + // We might want to remove this bit group by merging it with the previous + // group (which might be the ending group). + auto IP = (I == BitGroups.begin()) ? 
+ std::prev(BitGroups.end()) : std::prev(I); + if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt && + I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) { + + DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with group with range [" << + IP->StartIdx << ", " << IP->EndIdx << "]\n"); + + IP->EndIdx = I->EndIdx; + IP->Repl32CR = IP->Repl32CR || I->Repl32CR; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + continue; + } else { + // There is a special case worth handling: If there is a single group + // covering the entire upper 32 bits, and it can be merged with both + // the next and previous groups (which might be the same group), then + // do so. If it is the same group (so there will be only one group in + // total), then we need to reverse the order of the range so that it + // covers the entire 64 bits. + if (I->StartIdx == 32 && I->EndIdx == 63) { + assert(std::next(I) == BitGroups.end() && + "bit group ends at index 63 but there is another?"); + auto IN = BitGroups.begin(); + + if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V && + (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt && + IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP && + IsAllLow32(*I)) { + + DEBUG(dbgs() << "\tcombining bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with 32-bit replicated groups with ranges [" << + IP->StartIdx << ", " << IP->EndIdx << "] and [" << + IN->StartIdx << ", " << IN->EndIdx << "]\n"); + + if (IP == IN) { + // There is only one other group; change it to cover the whole + // range (backward, so that it can still be Repl32 but cover the + // whole 64-bit range). + IP->StartIdx = 31; + IP->EndIdx = 30; + IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + } else { + // There are two separate groups, one before this group and one + // after us (at the beginning). We're going to remove this group, + // but also the group at the very beginning. + IP->EndIdx = IN->EndIdx; + IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + BitGroups.erase(BitGroups.begin()); + } + + // This must be the last group in the vector (and we might have + // just invalidated the iterator above), so break here. + break; + } + } + } + + ++I; + } + } + SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } + uint64_t getZerosMask() { + uint64_t Mask = 0; + for (unsigned i = 0; i < Bits.size(); ++i) { + if (Bits[i].hasValue()) + continue; + Mask |= (1ul << i); + } + + return ~Mask; + } + // Depending on the number of groups for a particular value, it might be // better to rotate, mask explicitly (using andi/andis), and then or the // result. Select this part of the result first. 
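
// A rough stand-alone model (illustrative only, not the selector itself) of
// the comparison SelectAndParts32 makes below: rotate once if needed, split
// the 32-bit mask into an andi. half (low 16 bits) and an andis. half (high
// 16 bits), OR the two halves together if both are present, and OR into any
// partial result -- versus emitting one rlwinm/rlwimi per bit group.
#include <cstdint>

static unsigned maskingCost32(uint32_t Mask, bool NeedsRotate,
                              bool HavePartialResult) {
  unsigned ANDIMask = Mask & 0xFFFFu, ANDISMask = Mask >> 16;
  return (unsigned)NeedsRotate + (unsigned)(ANDIMask != 0) +
         (unsigned)(ANDISMask != 0) +
         (unsigned)(ANDIMask != 0 && ANDISMask != 0) +
         (unsigned)HavePartialResult;
}

// Masking only wins when it is strictly cheaper than one rotate-and-insert
// per bit group; ties go to the rotates.
static bool preferMasking32(uint32_t Mask, bool NeedsRotate,
                            bool HavePartialResult, unsigned NumGroups) {
  return maskingCost32(Mask, NeedsRotate, HavePartialResult) < NumGroups;
}
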
- void SelectAndParts32(SDNode *N, SDValue &Res) { - SDLoc dl(N); + void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; for (ValueRotInfo &VRI : ValueRotsVec) { unsigned Mask = 0; @@ -842,9 +1175,19 @@ class BitPermutationSelector { (unsigned) (ANDISMask != 0) + (unsigned) (ANDIMask != 0 && ANDISMask != 0) + (unsigned) (bool) Res; + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << ":" << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << VRI.NumGroups << "\n"); + if (NumAndInsts >= VRI.NumGroups) continue; + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + SDValue VRot; if (VRI.RLAmt) { SDValue Ops[] = @@ -890,19 +1233,22 @@ class BitPermutationSelector { } // Instruction selection for the 32-bit case. - SDNode *Select32(SDNode *N) { + SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) { SDLoc dl(N); SDValue Res; + if (InstCnt) *InstCnt = 0; + // Take care of cases that should use andi/andis first. - SelectAndParts32(N, Res); + SelectAndParts32(dl, Res, InstCnt); // If we've not yet selected a 'starting' instruction, and we have no zeros // to fill in, select the (Value, RLAmt) with the highest priority (largest // number of groups), and start with this rotated value. - if (!HasZeros && !Res) { + if ((!HasZeros || LateMask) && !Res) { ValueRotInfo &VRI = ValueRotsVec[0]; if (VRI.RLAmt) { + if (InstCnt) *InstCnt += 1; SDValue Ops[] = { VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) }; Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); @@ -919,9 +1265,11 @@ class BitPermutationSelector { } } + if (InstCnt) *InstCnt += BitGroups.size(); + // Insert the other groups (one at a time). for (auto &BG : BitGroups) { - if (!Res.getNode()) { + if (!Res) { SDValue Ops[] = { BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1), getI32Imm(Bits.size() - BG.StartIdx - 1) }; @@ -934,9 +1282,488 @@ class BitPermutationSelector { } } + if (LateMask) { + unsigned Mask = (unsigned) getZerosMask(); + + unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in zeros mask?"); + + if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, + Res, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, + Res, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + Res = ANDISVal; + else if (!ANDISVal) + Res = ANDIVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, + ANDIVal, ANDISVal), 0); + } + return Res.getNode(); } + unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + bool IsIns) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (Repl32) + return 1; + + if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) || + InstMaskEnd == 63 - RLAmt) + return 1; + + return 2; + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce one if it is available. 
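
// Illustrative sketch (an assumed helper, not part of the patch) of the rule
// SelectRotMask64 below implements. With bit indices counted from the
// low-order end (MaskStart <= MaskEnd, as the selector uses them), a rotate
// left by RLAmt followed by a contiguous mask is a single instruction only in
// a few cases; everything else needs a plain rotate plus an rldic.
#include <string>

static std::string rotMask64Form(unsigned RLAmt, unsigned MaskStart,
                                 unsigned MaskEnd, bool Repl32) {
  // Convert to the big-endian bit numbering the instructions use.
  unsigned InstMaskStart = 64 - MaskEnd - 1, InstMaskEnd = 64 - MaskStart - 1;
  if (Repl32)
    return "rlwinm";            // mask confined to the low-order 32 bits
  if (InstMaskEnd == 63)
    return "rldicl";            // mask reaches the low-order bit
  if (InstMaskStart == 0)
    return "rldicr";            // mask reaches the high-order bit
  if (InstMaskEnd == 63 - RLAmt)
    return "rldic";             // mask end is pinned to the rotation amount
  // Otherwise: an unmasked "inverse" rotate by (64 + RLAmt - MaskStart) % 64,
  // followed by an rldic by MaskStart that applies the mask.
  return "rldicl + rldic";
}

// e.g. a plain rotate (full mask) is a single rldicl, as in @bs8 in the new
// bperm.ll test: rotMask64Form(16, 0, 63, false) == "rldicl". For the insert
// form (rldimi/rlwimi), only the rldic-like case and the Repl32 case are
// single instructions.
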
+ SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). + assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32), + getI32Imm(InstMaskEnd - 32) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); + } + + if (InstMaskStart == 0) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskEnd) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce a rotate-mask-and-insert if one is available. + SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt, + bool Repl32, unsigned MaskStart, + unsigned MaskEnd, unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). 
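
// A small stand-alone model (illustrative, not LLVM code) of what "replicated
// in the high 32 bits" means for rlwinm/rlwimi in 64-bit mode: the low word of
// the source is doubled into both halves before the 64-bit rotation, and the
// MB > ME (wrap-around) mask forms can then keep bits in the high half too.
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

// rlwinm RA,RS,SH,MB,ME in 64-bit mode, with MB/ME in the 32-bit numbering
// used by the CHECK lines in the new bperm.ll test.
static uint64_t rlwinm64(uint64_t RS, unsigned SH, unsigned MB, unsigned ME) {
  uint64_t Lo = RS & 0xFFFFFFFFULL;
  uint64_t Rot = rotl64((Lo << 32) | Lo, SH);    // replicate, then rotate
  // MASK(MB+32, ME+32) over 64 bits, wrapping when MB > ME.
  uint64_t FromMB = ~0ULL >> (MB + 32);
  uint64_t ToME = ~0ULL << (31 - ME);
  return Rot & (MB <= ME ? (FromMB & ToME) : (FromMB | ToME));
}

// e.g. rlwinm64(X, 0, 1, 0) == ((X & 0xffffffff) | (X << 32)), which is the
// single "rlwinm 3, 3, 0, 1, 0" that @test11 below expects.
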
+ assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32), + getI32Imm(InstMaskEnd - 32) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; + + // The idea here is the same as in the 32-bit version, but with additional + // complications from the fact that Repl32 might be true. Because we + // aggressively convert bit groups to Repl32 form (which, for small + // rotation factors, involves no other change), and then coalesce, it might + // be the case that a single 64-bit masking operation could handle both + // some Repl32 groups and some non-Repl32 groups. If converting to Repl32 + // form allowed coalescing, then we must use a 32-bit rotaton in order to + // completely capture the new combined bit group. + + for (ValueRotInfo &VRI : ValueRotsVec) { + uint64_t Mask = 0; + + // We need to add to the mask all bits from the associated bit groups. + // If Repl32 is false, we need to add bits from bit groups that have + // Repl32 true, but are trivially convertable to Repl32 false. Such a + // group is trivially convertable if it overlaps only with the lower 32 + // bits, and the group has not been coalesced. + auto MatchingBG = [VRI](BitGroup &BG) { + if (VRI.V != BG.V) + return false; + + unsigned EffRLAmt = BG.RLAmt; + if (!VRI.Repl32 && BG.Repl32) { + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx && + !BG.Repl32Coalesced) { + if (BG.Repl32CR) + EffRLAmt += 32; + } else { + return false; + } + } else if (VRI.Repl32 != BG.Repl32) { + return false; + } + + if (VRI.RLAmt != EffRLAmt) + return false; + + return true; + }; + + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) + Mask |= (1ul << i); + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) + Mask |= (1ul << i); + for (unsigned i = 0; i <= BG.EndIdx; ++i) + Mask |= (1ul << i); + } + } + + // We can use the 32-bit andi/andis technique if the mask does not + // require any higher-order bits. This can save an instruction compared + // to always using the general 64-bit technique. + bool Use32BitInsts = isUInt<32>(Mask); + // Compute the masks for andi/andis that would be necessary. 
+ unsigned ANDIMask = (Mask & UINT16_MAX), + ANDISMask = (Mask >> 16) & UINT16_MAX; + + bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)); + + unsigned NumAndInsts = (unsigned) NeedsRotate + + (unsigned) (bool) Res; + if (Use32BitInsts) + NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + else + NumAndInsts += SelectInt64Count(Mask) + /* and */ 1; + + unsigned NumRLInsts = 0; + bool FirstBG = true; + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + NumRLInsts += + SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx, + !FirstBG); + FirstBG = false; + } + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << NumRLInsts << "\n"); + + // When we'd use andi/andis, we bias toward using the rotates (andi only + // has a record form, and is cracked on POWER cores). However, when using + // general 64-bit constant formation, bias toward the constant form, + // because that exposes more opportunities for CSE. + if (NumAndInsts > NumRLInsts) + continue; + if (Use32BitInsts && NumAndInsts == NumRLInsts) + continue; + + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + + SDValue VRot; + // We actually need to generate a rotation if we have a non-zero rotation + // factor or, in the Repl32 case, if we care about any of the + // higher-order replicated bits. In the latter case, we generate a mask + // backward so that it actually includes the entire 64 bits. + if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask))) + VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63); + else + VRot = VRI.V; + + SDValue TotalVal; + if (Use32BitInsts) { + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask when using 32-bit ands for 64-bit value"); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + VRot, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, + VRot, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + TotalVal = ANDISVal; + else if (!ANDISVal) + TotalVal = ANDIVal; + else + TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + ANDIVal, ANDISVal), 0); + } else { + TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0); + TotalVal = + SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + VRot, TotalVal), 0); + } + + if (!Res) + Res = TotalVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + Res, TotalVal), 0); + + // Now, remove all groups with this underlying value and rotation + // factor. + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + if (MatchingBG(*I)) + I = BitGroups.erase(I); + else + ++I; + } + } + } + + // Instruction selection for the 64-bit case. + SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) { + SDLoc dl(N); + SDValue Res; + + if (InstCnt) *InstCnt = 0; + + // Take care of cases that should use andi/andis first. + SelectAndParts64(dl, Res, InstCnt); + + // If we've not yet selected a 'starting' instruction, and we have no zeros + // to fill in, select the (Value, RLAmt) with the highest priority (largest + // number of groups), and start with this rotated value. 
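
// Illustrative only: a minimal stand-in for the ordering ValueRotInfo defines
// above (non-Repl32 entries first, then more groups first, then -- assumed
// from the member comment -- earlier FirstGroupStartIdx first). A Repl32
// entry with more groups can still sort behind a non-Repl32 entry with fewer,
// which is why the code below searches for the bigger Repl32 candidate rather
// than always taking ValueRotsVec[0].
#include <algorithm>
#include <cassert>
#include <vector>

namespace {
struct RotInfoModel {
  bool Repl32;
  unsigned NumGroups;
  unsigned FirstGroupStartIdx;
  bool operator<(const RotInfoModel &O) const {
    if (Repl32 != O.Repl32)
      return !Repl32;
    if (NumGroups != O.NumGroups)
      return NumGroups > O.NumGroups;
    return FirstGroupStartIdx < O.FirstGroupStartIdx;
  }
};

void sortOrderExample() {
  std::vector<RotInfoModel> V = {{true, 3, 0}, {false, 1, 5}};
  std::sort(V.begin(), V.end());
  assert(!V[0].Repl32 && V[1].NumGroups == 3);   // more groups, yet second
}
}
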
+ if ((!HasZeros || LateMask) && !Res) { + // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32 + // groups will come first, and so the VRI representing the largest number + // of groups might not be first (it might be the first Repl32 groups). + unsigned MaxGroupsIdx = 0; + if (!ValueRotsVec[0].Repl32) { + for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i) + if (ValueRotsVec[i].Repl32) { + if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups) + MaxGroupsIdx = i; + break; + } + } + + ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx]; + bool NeedsRotate = false; + if (VRI.RLAmt) { + NeedsRotate = true; + } else if (VRI.Repl32) { + for (auto &BG : BitGroups) { + if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt || + BG.Repl32 != VRI.Repl32) + continue; + + // We don't need a rotate if the bit group is confined to the lower + // 32 bits. + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx) + continue; + + NeedsRotate = true; + break; + } + } + + if (NeedsRotate) + Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63, + InstCnt); + else + Res = VRI.V; + + // Now, remove all groups with this underlying value and rotation factor. + if (Res) + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + if (I->V == VRI.V && I->RLAmt == VRI.RLAmt && I->Repl32 == VRI.Repl32) + I = BitGroups.erase(I); + else + ++I; + } + } + + // Because 64-bit rotates are more flexible than inserts, we might have a + // preference regarding which one we do first (to save one instruction). + if (!Res) + for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) { + if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + false) < + SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + true)) { + if (I != BitGroups.begin()) { + BitGroup BG = *I; + BitGroups.erase(I); + BitGroups.insert(BitGroups.begin(), BG); + } + + break; + } + } + + // Insert the other groups (one at a time). + for (auto &BG : BitGroups) { + if (!Res) + Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx, + BG.EndIdx, InstCnt); + else + Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32, + BG.StartIdx, BG.EndIdx, InstCnt); + } + + if (LateMask) { + uint64_t Mask = getZerosMask(); + + // We can use the 32-bit andi/andis technique if the mask does not + // require any higher-order bits. This can save an instruction compared + // to always using the general 64-bit technique. + bool Use32BitInsts = isUInt<32>(Mask); + // Compute the masks for andi/andis that would be necessary. 
+ unsigned ANDIMask = (Mask & UINT16_MAX), + ANDISMask = (Mask >> 16) & UINT16_MAX; + + if (Use32BitInsts) { + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask when using 32-bit ands for 64-bit value"); + + if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + Res, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, + Res, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + Res = ANDISVal; + else if (!ANDISVal) + Res = ANDIVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + ANDIVal, ANDISVal), 0); + } else { + if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1; + + SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0); + Res = + SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + Res, MaskVal), 0); + } + } + + return Res.getNode(); + } + + SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) { + // Fill in BitGroups. + collectBitGroups(LateMask); + if (BitGroups.empty()) + return nullptr; + + // For 64-bit values, figure out when we can use 32-bit instructions. + if (Bits.size() == 64) + assignRepl32BitGroups(); + + // Fill in ValueRotsVec. + collectValueRotInfo(); + + if (Bits.size() == 32) { + return Select32(N, LateMask, InstCnt); + } else { + assert(Bits.size() == 64 && "Not 64 bits here?"); + return Select64(N, LateMask, InstCnt); + } + + return nullptr; + } + SmallVector Bits; bool HasZeros; @@ -968,22 +1795,34 @@ public: // Fill it RLAmt and set HasZeros. computeRotationAmounts(); - // Fill in BitGroups. - collectBitGroups(); - if (BitGroups.empty()) - return nullptr; + if (!HasZeros) + return Select(N, false); - // Fill in ValueRotsVec. - collectValueRotInfo(); + // We currently have two techniques for handling results with zeros: early + // masking (the default) and late masking. Late masking is sometimes more + // efficient, but because the structure of the bit groups is different, it + // is hard to tell without generating both and comparing the results. With + // late masking, we ignore zeros in the resulting value when inserting each + // set of bit groups, and then mask in the zeros at the end. With early + // masking, we only insert the non-zero parts of the result at every step. - if (Bits.size() == 32) { - return Select32(N); - } else { - assert(Bits.size() == 64 && "Not 64 bits here?"); - // TODO: The 64-bit case! 
+ unsigned InstCnt, InstCntLateMask; + DEBUG(dbgs() << "\tEarly masking:\n"); + SDNode *RN = Select(N, false, &InstCnt); + DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n"); + + DEBUG(dbgs() << "\tLate masking:\n"); + SDNode *RNLM = Select(N, true, &InstCntLateMask); + DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask << + " instructions\n"); + + if (InstCnt <= InstCntLateMask) { + DEBUG(dbgs() << "\tUsing early-masking for isel\n"); + return RN; } - return nullptr; + DEBUG(dbgs() << "\tUsing late-masking for isel\n"); + return RNLM; } }; } // anonymous namespace @@ -993,6 +1832,9 @@ SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) { N->getValueType(0) != MVT::i64) return nullptr; + if (!UseBitPermRewriter) + return nullptr; + switch (N->getOpcode()) { default: break; case ISD::ROTL: @@ -1431,77 +2273,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { default: break; case ISD::Constant: { - if (N->getValueType(0) == MVT::i64) { - // Get 64 bit value. - int64_t Imm = cast(N)->getZExtValue(); - // Assume no remaining bits. - unsigned Remainder = 0; - // Assume no shift required. - unsigned Shift = 0; - - // If it can't be represented as a 32 bit value. - if (!isInt<32>(Imm)) { - Shift = countTrailingZeros(Imm); - int64_t ImmSh = static_cast(Imm) >> Shift; - - // If the shifted value fits 32 bits. - if (isInt<32>(ImmSh)) { - // Go with the shifted value. - Imm = ImmSh; - } else { - // Still stuck with a 64 bit value. - Remainder = Imm; - Shift = 32; - Imm >>= 32; - } - } - - // Intermediate operand. - SDNode *Result; - - // Handle first 32 bits. - unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; - - // Simple value. - if (isInt<16>(Imm)) { - // Just the Lo bits. - Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); - } else if (Lo) { - // Handle the Hi bits. - unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; - Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); - // And Lo bits. - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } else { - // Just the Hi bits. - Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); - } - - // If no shift, we're done. - if (!Shift) return Result; - - // Shift for next step if the upper 32-bits were not zero. - if (Imm) { - Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, - SDValue(Result, 0), - getI32Imm(Shift), - getI32Imm(63 - Shift)); - } - - // Add in the last bits as required. - if ((Hi = (Remainder >> 16) & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Hi)); - } - if ((Lo = Remainder & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } - - return Result; - } + if (N->getValueType(0) == MVT::i64) + return SelectInt64(CurDAG, N); break; } diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index bea27a34926..7d1249c2ce7 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -617,13 +617,11 @@ def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), } let hasSideEffects = 0 in { -let isCommutable = 1 in { defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA), (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64, RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; -} // Rotate instructions. 
defm RLDCL : MDSForm_1r<30, 8, diff --git a/test/CodeGen/PowerPC/bperm.ll b/test/CodeGen/PowerPC/bperm.ll index 0f920328a05..c489c1f90a8 100644 --- a/test/CodeGen/PowerPC/bperm.ll +++ b/test/CodeGen/PowerPC/bperm.ll @@ -16,6 +16,100 @@ entry: ; CHECK: blr } +define i64 @bs8(i64 %x) #0 { +entry: + %0 = tail call i64 @llvm.bswap.i64(i64 %x) + ret i64 %0 + +; CHECK-LABEL: @bs8 +; CHECK-DAG: rldicl [[REG1:[0-9]+]], 3, 16, 0 +; CHECK-DAG: rldicl [[REG2:[0-9]+]], 3, 8, 0 +; CHECK-DAG: rldicl [[REG3:[0-9]+]], 3, 24, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG1]], 8, 48 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 32, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG3]], 16, 40 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 3, 48, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG4]], 24, 32 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 56, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG5]], 40, 16 +; CHECK-DAG: rldimi [[REG2]], [[REG6]], 48, 8 +; CHECK-DAG: rldimi [[REG2]], 3, 56, 0 +; CHECK: mr 3, [[REG2]] +; CHECK: blr +} + +define i64 @test1(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 8 + %and = and i64 %0, 5963776000 + ret i64 %and + +; CHECK-LABEL: @test1 +; CHECK-DAG: li [[REG1:[0-9]+]], 11375 +; CHECK-DAG: rldicl [[REG3:[0-9]+]], 4, 56, 0 +; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 19 +; CHECK: and 3, [[REG3]], [[REG2]] +; CHECK: blr +} + +define i64 @test2(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 6 + %and = and i64 %0, 133434808670355456 + ret i64 %and + +; CHECK-LABEL: @test2 +; CHECK-DAG: lis [[REG1:[0-9]+]], 474 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 58, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 3648 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32 +; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464 +; CHECK: and 3, [[REG5]], [[REG4]] +; CHECK: blr +} + +define i64 @test3(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i0, 34 + %and = and i64 %0, 191795733152661504 + ret i64 %and + +; CHECK-LABEL: @test3 +; CHECK-DAG: lis [[REG1:[0-9]+]], 170 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 34, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 22861 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 34 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + +define i64 @test4(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 15 + %and = and i64 %0, 58195968 + ret i64 %and + +; CHECK-LABEL: @test4 +; CHECK: rldicl [[REG1:[0-9]+]], 4, 49, 0 +; CHECK: andis. 
3, [[REG1]], 888 +; CHECK: blr +} + +define i64 @test5(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i1, 12 + %and = and i64 %0, 127252959854592 + ret i64 %and + +; CHECK-LABEL: @test5 +; CHECK-DAG: lis [[REG1:[0-9]+]], 3703 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 4, 12, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 35951 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + ; Function Attrs: nounwind readnone define zeroext i32 @test6(i32 zeroext %x) #0 { entry: @@ -33,8 +127,153 @@ entry: ; CHECK: blr } +define i64 @test7(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i0, 5 + %and = and i64 %0, 58195968 + ret i64 %and + +; CHECK-LABEL: @test7 +; CHECK: rlwinm [[REG1:[0-9]+]], 3, 27, 9, 12 +; CHECK: rlwimi [[REG1]], 3, 27, 6, 7 +; CHECK: mr 3, [[REG1]] +; CHECK: blr +} + +define i64 @test8(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i0, 1 + %and = and i64 %0, 169172533248 + ret i64 %and + +; CHECK-LABEL: @test8 +; CHECK-DAG: lis [[REG1:[0-9]+]], 4 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 63, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 60527 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + +define i64 @test9(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 14 + %and = and i64 %0, 18848677888 + %1 = shl i64 %i1, 51 + %and3 = and i64 %1, 405323966463344640 + %or4 = or i64 %and, %and3 + ret i64 %or4 + +; CHECK-LABEL: @test9 +; CHECK-DAG: lis [[REG1:[0-9]+]], 1440 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 62, 0 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 4, 50, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 4 +; CHECK-DAG: rldimi [[REG6]], [[REG5]], 53, 0 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32 +; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464 +; CHECK: and 3, [[REG6]], [[REG4]] +; CHECK: blr +} + +define i64 @test10(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i0, 37 + %and = and i64 %0, 15881483390550016 + %1 = shl i64 %i0, 25 + %and3 = and i64 %1, 2473599172608 + %or4 = or i64 %and, %and3 + ret i64 %or4 + +; CHECK-LABEL: @test10 +; CHECK-DAG: lis [[REG1:[0-9]+]], 1 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 25, 0 +; CHECK-DAG: rldicl [[REG7:[0-9]+]], 3, 37, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 8183 +; CHECK-DAG: ori [[REG3:[0-9]+]], [[REG1]], 50017 +; CHECK-DAG: sldi [[REG4:[0-9]+]], [[REG2]], 25 +; CHECK-DAG: sldi [[REG5:[0-9]+]], [[REG3]], 37 +; CHECK-DAG: and [[REG8:[0-9]+]], [[REG6]], [[REG4]] +; CHECK-DAG: and [[REG9:[0-9]+]], [[REG7]], [[REG5]] +; CHECK: or 3, [[REG9]], [[REG8]] +; CHECK: blr +} + +define i64 @test11(i64 %x) #0 { +entry: + %and = and i64 %x, 4294967295 + %shl = shl i64 %x, 32 + %or = or i64 %and, %shl + ret i64 %or + +; CHECK-LABEL: @test11 +; CHECK: rlwinm 3, 3, 0, 1, 0 +; CHECK: blr +} + +define i64 @test12(i64 %x) #0 { +entry: + %and = and i64 %x, 4294905855 + %shl = shl i64 %x, 32 + %or = or i64 %and, %shl + ret i64 %or + +; CHECK-LABEL: @test12 +; CHECK: rlwinm 3, 3, 0, 20, 15 +; CHECK: blr +} + +define i64 @test13(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %or = or i64 %and, %and1 + ret i64 %or + +; CHECK-LABEL: @test13 +; CHECK: rlwinm 3, 3, 4, 24, 31 +; CHECK: blr +} + +define i64 @test14(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %and2 = and i64 %x, -4294967296 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and + ret i64 %or3 + +; CHECK-LABEL: @test14 +; CHECK: rldicr 
[[REG1:[0-9]+]], 3, 0, 31 +; CHECK: rlwimi [[REG1]], 3, 4, 24, 31 +; CHECK: mr 3, [[REG1]] +; CHECK: blr +} + +define i64 @test15(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %and2 = and i64 %x, -256 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and + ret i64 %or3 + +; CHECK-LABEL: @test15 +; CHECK: rlwimi 3, 3, 4, 24, 31 +; CHECK: blr +} + ; Function Attrs: nounwind readnone declare i32 @llvm.bswap.i32(i32) #0 +declare i64 @llvm.bswap.i64(i64) #0 attributes #0 = { nounwind readnone }
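
For reference, the counting that SelectInt64Count does can be exercised outside
of LLVM. The sketch below is illustrative only; isInt32/isInt16/ctz64 are
portable stand-ins for the LLVM helpers, and the example checks the mask from
@test1 above, which the selector materializes as "li 11375; sldi 19".

#include <cassert>
#include <cstdint>

static bool isInt32(int64_t X) { return X == (int64_t)(int32_t)X; }
static bool isInt16(int64_t X) { return X == (int64_t)(int16_t)X; }
static unsigned ctz64(uint64_t X) { return __builtin_ctzll(X); } // GCC/Clang

// Mirrors the counting logic of SelectInt64Count: li/lis, optional ori,
// optional sldi (rldicr), and optional oris/ori for the remaining low bits.
static unsigned int64MaterializationCost(int64_t Imm) {
  uint64_t Remainder = 0;
  unsigned Shift = 0;
  if (!isInt32(Imm)) {
    Shift = ctz64(Imm);
    int64_t ImmSh = (int64_t)((uint64_t)Imm >> Shift);
    if (isInt32(ImmSh)) {
      Imm = ImmSh;
    } else {
      Remainder = Imm;
      Shift = 32;
      Imm >>= 32;
    }
  }
  unsigned Result = 0;
  unsigned Lo = Imm & 0xFFFF;
  if (isInt16(Imm))
    ++Result;                          // li
  else if (Lo)
    Result += 2;                       // lis + ori
  else
    ++Result;                          // lis
  if (!Shift)
    return Result;
  if (Imm)
    ++Result;                          // rldicr (sldi)
  if ((Remainder >> 16) & 0xFFFF)
    ++Result;                          // oris
  if (Remainder & 0xFFFF)
    ++Result;                          // ori
  return Result;
}

int main() {
  assert((11375LL << 19) == 5963776000LL);           // the @test1 mask
  assert(int64MaterializationCost(5963776000LL) == 2); // li + sldi
  return 0;
}
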
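
The two-instruction fallback in SelectRotMask64/SelectRotMaskIns64 relies on
64-bit rotations composing additively modulo 64: an "inverse" rotation by
(64 + RLAmt - MaskStart) % 64 followed by a rotation by MaskStart restores the
requested amount. A quick stand-alone check of that identity (illustrative
only):

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

int main() {
  uint64_t X = 0x0123456789ABCDEFULL;
  for (unsigned RLAmt = 0; RLAmt < 64; ++RLAmt)
    for (unsigned MaskStart = 0; MaskStart < 64; ++MaskStart) {
      unsigned RLAmt1 = (64 + RLAmt - MaskStart) % 64;
      unsigned RLAmt2 = MaskStart;
      assert(rotl64(rotl64(X, RLAmt1), RLAmt2) == rotl64(X, RLAmt));
    }
  return 0;
}
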
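
The PPCInstr64Bit.td hunk above drops isCommutable from RLDIMI: the tied
insert-target register and the rotated source play different roles, so swapping
them changes the result. A stand-alone model of the instruction (illustrative;
the mask helper follows the ISA's big-endian bit numbering, bit 0 = MSB) makes
that concrete:

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

// MASK(MB, ME) with wrap-around when MB > ME.
static uint64_t ppcMask(unsigned MB, unsigned ME) {
  uint64_t FromMB = ~0ULL >> MB;          // ones from bit MB down to bit 63
  uint64_t ToME = ~0ULL << (63 - ME);     // ones from bit 0 down to bit ME
  return MB <= ME ? (FromMB & ToME) : (FromMB | ToME);
}

// rldimi RA,RS,SH,MB: rotate RS left by SH and insert it into RA under
// MASK(MB, 63-SH); RA is both a source and the destination.
static uint64_t rldimi(uint64_t RA, uint64_t RS, unsigned SH, unsigned MB) {
  uint64_t M = ppcMask(MB, 63 - SH);
  return (rotl64(RS, SH) & M) | (RA & ~M);
}

int main() {
  // The two register operands are not interchangeable.
  uint64_t A = 0x1122334455667788ULL, B = 0xCAFEBABEDEADBEEFULL;
  assert(rldimi(A, B, 8, 48) != rldimi(B, A, 8, 48));
  return 0;
}
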