diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index bc670768989..c12a82c03b8 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -42,6 +42,12 @@ using namespace llvm; cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); +cl::opt UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true), + cl::desc("use aggressive ppc isel for bit permutations"), cl::Hidden); +cl::opt BPermRewriterNoMasking("ppc-bit-perm-rewriter-stress-rotates", + cl::desc("stress rotate selection in aggressive ppc isel for " + "bit permutations"), cl::Hidden); + namespace llvm { void initializePPCDAGToDAGISelPass(PassRegistry&); } @@ -533,6 +539,152 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { return nullptr; } +// Predict the number of instructions that would be generated by calling +// SelectInt64(N). +static unsigned SelectInt64Count(int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros(Imm); + int64_t ImmSh = static_cast(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + unsigned Result = 0; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + ++Result; + } else if (Lo) { + // Handle the Hi bits and Lo bits. + Result += 2; + } else { + // Just the Hi bits. + ++Result; + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) + ++Result; + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) + ++Result; + if ((Lo = Remainder & 0xFFFF)) + ++Result; + + return Result; +} + +// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count +// (above) needs to be kept in sync with this function. +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros(Imm); + int64_t ImmSh = static_cast(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + SDNode *Result; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + auto getI32Imm = [CurDAG](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + }; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + } else if (Lo) { + // Handle the Hi bits. + unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); + // And Lo bits. + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } else { + // Just the Hi bits. 
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) { + Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, + SDValue(Result, 0), + getI32Imm(Shift), + getI32Imm(63 - Shift)); + } + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Hi)); + } + if ((Lo = Remainder & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } + + return Result; +} + +// Select a 64-bit constant. +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) { + SDLoc dl(N); + + // Get 64 bit value. + int64_t Imm = cast(N)->getZExtValue(); + return SelectInt64(CurDAG, dl, Imm); +} + + namespace { class BitPermutationSelector { struct ValueBit { @@ -577,8 +729,19 @@ class BitPermutationSelector { unsigned RLAmt; unsigned StartIdx, EndIdx; + // This rotation amount assumes that the lower 32 bits of the quantity are + // replicated in the high 32 bits by the rotation operator (which is done + // by rlwinm and friends in 64-bit mode). + bool Repl32; + // Did converting to Repl32 == true change the rotation factor? If it did, + // it decreased it by 32. + bool Repl32CR; + // Was this group coalesced after setting Repl32 to true? + bool Repl32Coalesced; + BitGroup(SDValue V, unsigned R, unsigned S, unsigned E) - : V(V), RLAmt(R), StartIdx(S), EndIdx(E) { + : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false), + Repl32Coalesced(false) { DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R << " [" << S << ", " << E << "]\n"); } @@ -591,14 +754,23 @@ class BitPermutationSelector { unsigned RLAmt; unsigned NumGroups; unsigned FirstGroupStartIdx; + bool Repl32; ValueRotInfo() - : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX) {} + : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX), + Repl32(false) {} // For sorting (in reverse order) by NumGroups, and then by // FirstGroupStartIdx. bool operator < (const ValueRotInfo &Other) const { - if (NumGroups > Other.NumGroups) + // We need to sort so that the non-Repl32 come first because, when we're + // doing masking, the Repl32 bit groups might be subsumed into the 64-bit + // masking operation. + if (Repl32 < Other.Repl32) + return true; + else if (Repl32 > Other.Repl32) + return false; + else if (NumGroups > Other.NumGroups) return true; else if (NumGroups < Other.NumGroups) return false; @@ -729,8 +901,9 @@ class BitPermutationSelector { } // Collect groups of consecutive bits with the same underlying value and - // rotation factor. - void collectBitGroups() { + // rotation factor. If we're doing late masking, we ignore zeros, otherwise + // they break up groups. + void collectBitGroups(bool LateMask) { BitGroups.clear(); unsigned LastRLAmt = RLAmt[0]; @@ -739,6 +912,14 @@ class BitPermutationSelector { for (unsigned i = 1; i < Bits.size(); ++i) { unsigned ThisRLAmt = RLAmt[i]; SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue(); + if (LateMask && !ThisValue) { + ThisValue = LastValue; + ThisRLAmt = LastRLAmt; + // If we're doing late masking, then the first bit group always starts + // at zero (even if the first bits were zero). 
+ if (BitGroups.empty()) + LastGroupStartIdx = 0; + } // If this bit has the same underlying value and the same rotate factor as // the last one, then they're part of the same group. @@ -768,6 +949,7 @@ class BitPermutationSelector { BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && BitGroups[0].V == BitGroups[BitGroups.size()-1].V && BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { + DEBUG(dbgs() << "\tcombining final bit group with inital one\n"); BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; BitGroups.erase(BitGroups.begin()); } @@ -781,9 +963,11 @@ class BitPermutationSelector { ValueRots.clear(); for (auto &BG : BitGroups) { - ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, BG.RLAmt)]; + unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0); + ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)]; VRI.V = BG.V; VRI.RLAmt = BG.RLAmt; + VRI.Repl32 = BG.Repl32; VRI.NumGroups += 1; VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx); } @@ -797,15 +981,164 @@ class BitPermutationSelector { std::sort(ValueRotsVec.begin(), ValueRotsVec.end()); } + // In 64-bit mode, rlwinm and friends have a rotation operator that + // replicates the low-order 32 bits into the high-order 32-bits. The mask + // indices of these instructions can only be in the lower 32 bits, so they + // can only represent some 64-bit bit groups. However, when they can be used, + // the 32-bit replication can be used to represent, as a single bit group, + // otherwise separate bit groups. We'll convert to replicated-32-bit bit + // groups when possible. Returns true if any of the bit groups were + // converted. + void assignRepl32BitGroups() { + // If we have bits like this: + // + // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24 + // Groups: | RLAmt = 8 | RLAmt = 40 | + // + // But, making use of a 32-bit operation that replicates the low-order 32 + // bits into the high-order 32 bits, this can be one bit group with a RLAmt + // of 8. + + auto IsAllLow32 = [this](BitGroup & BG) { + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + for (unsigned i = 0; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } + + return true; + }; + + for (auto &BG : BitGroups) { + if (BG.StartIdx < 32 && BG.EndIdx < 32) { + if (IsAllLow32(BG)) { + if (BG.RLAmt >= 32) { + BG.RLAmt -= 32; + BG.Repl32CR = true; + } + + BG.Repl32 = true; + + DEBUG(dbgs() << "\t32-bit replicated bit group for " << + BG.V.getNode() << " RLAmt = " << BG.RLAmt << + " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n"); + } + } + } + + // Now walk through the bit groups, consolidating where possible. + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + // We might want to remove this bit group by merging it with the previous + // group (which might be the ending group). + auto IP = (I == BitGroups.begin()) ? 
+ std::prev(BitGroups.end()) : std::prev(I); + if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt && + I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) { + + DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with group with range [" << + IP->StartIdx << ", " << IP->EndIdx << "]\n"); + + IP->EndIdx = I->EndIdx; + IP->Repl32CR = IP->Repl32CR || I->Repl32CR; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + continue; + } else { + // There is a special case worth handling: If there is a single group + // covering the entire upper 32 bits, and it can be merged with both + // the next and previous groups (which might be the same group), then + // do so. If it is the same group (so there will be only one group in + // total), then we need to reverse the order of the range so that it + // covers the entire 64 bits. + if (I->StartIdx == 32 && I->EndIdx == 63) { + assert(std::next(I) == BitGroups.end() && + "bit group ends at index 63 but there is another?"); + auto IN = BitGroups.begin(); + + if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V && + (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt && + IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP && + IsAllLow32(*I)) { + + DEBUG(dbgs() << "\tcombining bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with 32-bit replicated groups with ranges [" << + IP->StartIdx << ", " << IP->EndIdx << "] and [" << + IN->StartIdx << ", " << IN->EndIdx << "]\n"); + + if (IP == IN) { + // There is only one other group; change it to cover the whole + // range (backward, so that it can still be Repl32 but cover the + // whole 64-bit range). + IP->StartIdx = 31; + IP->EndIdx = 30; + IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + } else { + // There are two separate groups, one before this group and one + // after us (at the beginning). We're going to remove this group, + // but also the group at the very beginning. + IP->EndIdx = IN->EndIdx; + IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + BitGroups.erase(BitGroups.begin()); + } + + // This must be the last group in the vector (and we might have + // just invalidated the iterator above), so break here. + break; + } + } + } + + ++I; + } + } + SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } + uint64_t getZerosMask() { + uint64_t Mask = 0; + for (unsigned i = 0; i < Bits.size(); ++i) { + if (Bits[i].hasValue()) + continue; + Mask |= (1ul << i); + } + + return ~Mask; + } + // Depending on the number of groups for a particular value, it might be // better to rotate, mask explicitly (using andi/andis), and then or the // result. Select this part of the result first. 
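
// A rough stand-alone model (illustrative only, not the selector itself) of
// the comparison SelectAndParts32 makes below: rotate once if needed, split
// the 32-bit mask into an andi. half (low 16 bits) and an andis. half (high
// 16 bits), OR the two halves together if both are present, and OR into any
// partial result -- versus emitting one rlwinm/rlwimi per bit group.
#include <cstdint>

static unsigned maskingCost32(uint32_t Mask, bool NeedsRotate,
                              bool HavePartialResult) {
  unsigned ANDIMask = Mask & 0xFFFFu, ANDISMask = Mask >> 16;
  return (unsigned)NeedsRotate + (unsigned)(ANDIMask != 0) +
         (unsigned)(ANDISMask != 0) +
         (unsigned)(ANDIMask != 0 && ANDISMask != 0) +
         (unsigned)HavePartialResult;
}

// Masking only wins when it is strictly cheaper than one rotate-and-insert
// per bit group; ties go to the rotates.
static bool preferMasking32(uint32_t Mask, bool NeedsRotate,
                            bool HavePartialResult, unsigned NumGroups) {
  return maskingCost32(Mask, NeedsRotate, HavePartialResult) < NumGroups;
}
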
- void SelectAndParts32(SDNode *N, SDValue &Res) { - SDLoc dl(N); + void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; for (ValueRotInfo &VRI : ValueRotsVec) { unsigned Mask = 0; @@ -842,9 +1175,19 @@ class BitPermutationSelector { (unsigned) (ANDISMask != 0) + (unsigned) (ANDIMask != 0 && ANDISMask != 0) + (unsigned) (bool) Res; + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << ":" << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << VRI.NumGroups << "\n"); + if (NumAndInsts >= VRI.NumGroups) continue; + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + SDValue VRot; if (VRI.RLAmt) { SDValue Ops[] = @@ -890,19 +1233,22 @@ class BitPermutationSelector { } // Instruction selection for the 32-bit case. - SDNode *Select32(SDNode *N) { + SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) { SDLoc dl(N); SDValue Res; + if (InstCnt) *InstCnt = 0; + // Take care of cases that should use andi/andis first. - SelectAndParts32(N, Res); + SelectAndParts32(dl, Res, InstCnt); // If we've not yet selected a 'starting' instruction, and we have no zeros // to fill in, select the (Value, RLAmt) with the highest priority (largest // number of groups), and start with this rotated value. - if (!HasZeros && !Res) { + if ((!HasZeros || LateMask) && !Res) { ValueRotInfo &VRI = ValueRotsVec[0]; if (VRI.RLAmt) { + if (InstCnt) *InstCnt += 1; SDValue Ops[] = { VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) }; Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); @@ -919,9 +1265,11 @@ class BitPermutationSelector { } } + if (InstCnt) *InstCnt += BitGroups.size(); + // Insert the other groups (one at a time). for (auto &BG : BitGroups) { - if (!Res.getNode()) { + if (!Res) { SDValue Ops[] = { BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1), getI32Imm(Bits.size() - BG.StartIdx - 1) }; @@ -934,9 +1282,488 @@ class BitPermutationSelector { } } + if (LateMask) { + unsigned Mask = (unsigned) getZerosMask(); + + unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in zeros mask?"); + + if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, + Res, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, + Res, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + Res = ANDISVal; + else if (!ANDISVal) + Res = ANDIVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, + ANDIVal, ANDISVal), 0); + } + return Res.getNode(); } + unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + bool IsIns) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (Repl32) + return 1; + + if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) || + InstMaskEnd == 63 - RLAmt) + return 1; + + return 2; + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce one if it is available. 
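
// Illustrative sketch (an assumed helper, not part of the patch) of the rule
// SelectRotMask64 below implements. With bit indices counted from the
// low-order end (MaskStart <= MaskEnd, as the selector uses them), a rotate
// left by RLAmt followed by a contiguous mask is a single instruction only in
// a few cases; everything else needs a plain rotate plus an rldic.
#include <string>

static std::string rotMask64Form(unsigned RLAmt, unsigned MaskStart,
                                 unsigned MaskEnd, bool Repl32) {
  // Convert to the big-endian bit numbering the instructions use.
  unsigned InstMaskStart = 64 - MaskEnd - 1, InstMaskEnd = 64 - MaskStart - 1;
  if (Repl32)
    return "rlwinm";            // mask confined to the low-order 32 bits
  if (InstMaskEnd == 63)
    return "rldicl";            // mask reaches the low-order bit
  if (InstMaskStart == 0)
    return "rldicr";            // mask reaches the high-order bit
  if (InstMaskEnd == 63 - RLAmt)
    return "rldic";             // mask end is pinned to the rotation amount
  // Otherwise: an unmasked "inverse" rotate by (64 + RLAmt - MaskStart) % 64,
  // followed by an rldic by MaskStart that applies the mask.
  return "rldicl + rldic";
}

// e.g. a plain rotate (full mask) is a single rldicl, as in @bs8 in the new
// bperm.ll test: rotMask64Form(16, 0, 63, false) == "rldicl". For the insert
// form (rldimi/rlwimi), only the rldic-like case and the Repl32 case are
// single instructions.
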
+ SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). + assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32), + getI32Imm(InstMaskEnd - 32) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); + } + + if (InstMaskStart == 0) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskEnd) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce a rotate-mask-and-insert if one is available. + SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt, + bool Repl32, unsigned MaskStart, + unsigned MaskEnd, unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). 
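
// A small stand-alone model (illustrative, not LLVM code) of what "replicated
// in the high 32 bits" means for rlwinm/rlwimi in 64-bit mode: the low word of
// the source is doubled into both halves before the 64-bit rotation, and the
// MB > ME (wrap-around) mask forms can then keep bits in the high half too.
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

// rlwinm RA,RS,SH,MB,ME in 64-bit mode, with MB/ME in the 32-bit numbering
// used by the CHECK lines in the new bperm.ll test.
static uint64_t rlwinm64(uint64_t RS, unsigned SH, unsigned MB, unsigned ME) {
  uint64_t Lo = RS & 0xFFFFFFFFULL;
  uint64_t Rot = rotl64((Lo << 32) | Lo, SH);    // replicate, then rotate
  // MASK(MB+32, ME+32) over 64 bits, wrapping when MB > ME.
  uint64_t FromMB = ~0ULL >> (MB + 32);
  uint64_t ToME = ~0ULL << (31 - ME);
  return Rot & (MB <= ME ? (FromMB & ToME) : (FromMB | ToME));
}

// e.g. rlwinm64(X, 0, 1, 0) == ((X & 0xffffffff) | (X << 32)), which is the
// single "rlwinm 3, 3, 0, 1, 0" that @test11 below expects.
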
+ assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32), + getI32Imm(InstMaskEnd - 32) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; + + // The idea here is the same as in the 32-bit version, but with additional + // complications from the fact that Repl32 might be true. Because we + // aggressively convert bit groups to Repl32 form (which, for small + // rotation factors, involves no other change), and then coalesce, it might + // be the case that a single 64-bit masking operation could handle both + // some Repl32 groups and some non-Repl32 groups. If converting to Repl32 + // form allowed coalescing, then we must use a 32-bit rotaton in order to + // completely capture the new combined bit group. + + for (ValueRotInfo &VRI : ValueRotsVec) { + uint64_t Mask = 0; + + // We need to add to the mask all bits from the associated bit groups. + // If Repl32 is false, we need to add bits from bit groups that have + // Repl32 true, but are trivially convertable to Repl32 false. Such a + // group is trivially convertable if it overlaps only with the lower 32 + // bits, and the group has not been coalesced. + auto MatchingBG = [VRI](BitGroup &BG) { + if (VRI.V != BG.V) + return false; + + unsigned EffRLAmt = BG.RLAmt; + if (!VRI.Repl32 && BG.Repl32) { + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx && + !BG.Repl32Coalesced) { + if (BG.Repl32CR) + EffRLAmt += 32; + } else { + return false; + } + } else if (VRI.Repl32 != BG.Repl32) { + return false; + } + + if (VRI.RLAmt != EffRLAmt) + return false; + + return true; + }; + + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) + Mask |= (1ul << i); + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) + Mask |= (1ul << i); + for (unsigned i = 0; i <= BG.EndIdx; ++i) + Mask |= (1ul << i); + } + } + + // We can use the 32-bit andi/andis technique if the mask does not + // require any higher-order bits. This can save an instruction compared + // to always using the general 64-bit technique. + bool Use32BitInsts = isUInt<32>(Mask); + // Compute the masks for andi/andis that would be necessary. 
+ unsigned ANDIMask = (Mask & UINT16_MAX), + ANDISMask = (Mask >> 16) & UINT16_MAX; + + bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)); + + unsigned NumAndInsts = (unsigned) NeedsRotate + + (unsigned) (bool) Res; + if (Use32BitInsts) + NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + else + NumAndInsts += SelectInt64Count(Mask) + /* and */ 1; + + unsigned NumRLInsts = 0; + bool FirstBG = true; + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + NumRLInsts += + SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx, + !FirstBG); + FirstBG = false; + } + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << NumRLInsts << "\n"); + + // When we'd use andi/andis, we bias toward using the rotates (andi only + // has a record form, and is cracked on POWER cores). However, when using + // general 64-bit constant formation, bias toward the constant form, + // because that exposes more opportunities for CSE. + if (NumAndInsts > NumRLInsts) + continue; + if (Use32BitInsts && NumAndInsts == NumRLInsts) + continue; + + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + + SDValue VRot; + // We actually need to generate a rotation if we have a non-zero rotation + // factor or, in the Repl32 case, if we care about any of the + // higher-order replicated bits. In the latter case, we generate a mask + // backward so that it actually includes the entire 64 bits. + if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask))) + VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63); + else + VRot = VRI.V; + + SDValue TotalVal; + if (Use32BitInsts) { + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask when using 32-bit ands for 64-bit value"); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + VRot, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, + VRot, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + TotalVal = ANDISVal; + else if (!ANDISVal) + TotalVal = ANDIVal; + else + TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + ANDIVal, ANDISVal), 0); + } else { + TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0); + TotalVal = + SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + VRot, TotalVal), 0); + } + + if (!Res) + Res = TotalVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + Res, TotalVal), 0); + + // Now, remove all groups with this underlying value and rotation + // factor. + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + if (MatchingBG(*I)) + I = BitGroups.erase(I); + else + ++I; + } + } + } + + // Instruction selection for the 64-bit case. + SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) { + SDLoc dl(N); + SDValue Res; + + if (InstCnt) *InstCnt = 0; + + // Take care of cases that should use andi/andis first. + SelectAndParts64(dl, Res, InstCnt); + + // If we've not yet selected a 'starting' instruction, and we have no zeros + // to fill in, select the (Value, RLAmt) with the highest priority (largest + // number of groups), and start with this rotated value. 
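
// Illustrative only: a minimal stand-in for the ordering ValueRotInfo defines
// above (non-Repl32 entries first, then more groups first, then -- assumed
// from the member comment -- earlier FirstGroupStartIdx first). A Repl32
// entry with more groups can still sort behind a non-Repl32 entry with fewer,
// which is why the code below searches for the bigger Repl32 candidate rather
// than always taking ValueRotsVec[0].
#include <algorithm>
#include <cassert>
#include <vector>

namespace {
struct RotInfoModel {
  bool Repl32;
  unsigned NumGroups;
  unsigned FirstGroupStartIdx;
  bool operator<(const RotInfoModel &O) const {
    if (Repl32 != O.Repl32)
      return !Repl32;
    if (NumGroups != O.NumGroups)
      return NumGroups > O.NumGroups;
    return FirstGroupStartIdx < O.FirstGroupStartIdx;
  }
};

void sortOrderExample() {
  std::vector<RotInfoModel> V = {{true, 3, 0}, {false, 1, 5}};
  std::sort(V.begin(), V.end());
  assert(!V[0].Repl32 && V[1].NumGroups == 3);   // more groups, yet second
}
}
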
+ if ((!HasZeros || LateMask) && !Res) { + // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32 + // groups will come first, and so the VRI representing the largest number + // of groups might not be first (it might be the first Repl32 groups). + unsigned MaxGroupsIdx = 0; + if (!ValueRotsVec[0].Repl32) { + for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i) + if (ValueRotsVec[i].Repl32) { + if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups) + MaxGroupsIdx = i; + break; + } + } + + ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx]; + bool NeedsRotate = false; + if (VRI.RLAmt) { + NeedsRotate = true; + } else if (VRI.Repl32) { + for (auto &BG : BitGroups) { + if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt || + BG.Repl32 != VRI.Repl32) + continue; + + // We don't need a rotate if the bit group is confined to the lower + // 32 bits. + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx) + continue; + + NeedsRotate = true; + break; + } + } + + if (NeedsRotate) + Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63, + InstCnt); + else + Res = VRI.V; + + // Now, remove all groups with this underlying value and rotation factor. + if (Res) + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + if (I->V == VRI.V && I->RLAmt == VRI.RLAmt && I->Repl32 == VRI.Repl32) + I = BitGroups.erase(I); + else + ++I; + } + } + + // Because 64-bit rotates are more flexible than inserts, we might have a + // preference regarding which one we do first (to save one instruction). + if (!Res) + for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) { + if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + false) < + SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + true)) { + if (I != BitGroups.begin()) { + BitGroup BG = *I; + BitGroups.erase(I); + BitGroups.insert(BitGroups.begin(), BG); + } + + break; + } + } + + // Insert the other groups (one at a time). + for (auto &BG : BitGroups) { + if (!Res) + Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx, + BG.EndIdx, InstCnt); + else + Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32, + BG.StartIdx, BG.EndIdx, InstCnt); + } + + if (LateMask) { + uint64_t Mask = getZerosMask(); + + // We can use the 32-bit andi/andis technique if the mask does not + // require any higher-order bits. This can save an instruction compared + // to always using the general 64-bit technique. + bool Use32BitInsts = isUInt<32>(Mask); + // Compute the masks for andi/andis that would be necessary. 
+ unsigned ANDIMask = (Mask & UINT16_MAX), + ANDISMask = (Mask >> 16) & UINT16_MAX; + + if (Use32BitInsts) { + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask when using 32-bit ands for 64-bit value"); + + if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + Res, getI32Imm(ANDIMask)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, + Res, getI32Imm(ANDISMask)), 0); + + if (!ANDIVal) + Res = ANDISVal; + else if (!ANDISVal) + Res = ANDIVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + ANDIVal, ANDISVal), 0); + } else { + if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1; + + SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0); + Res = + SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + Res, MaskVal), 0); + } + } + + return Res.getNode(); + } + + SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) { + // Fill in BitGroups. + collectBitGroups(LateMask); + if (BitGroups.empty()) + return nullptr; + + // For 64-bit values, figure out when we can use 32-bit instructions. + if (Bits.size() == 64) + assignRepl32BitGroups(); + + // Fill in ValueRotsVec. + collectValueRotInfo(); + + if (Bits.size() == 32) { + return Select32(N, LateMask, InstCnt); + } else { + assert(Bits.size() == 64 && "Not 64 bits here?"); + return Select64(N, LateMask, InstCnt); + } + + return nullptr; + } + SmallVector Bits; bool HasZeros; @@ -968,22 +1795,34 @@ public: // Fill it RLAmt and set HasZeros. computeRotationAmounts(); - // Fill in BitGroups. - collectBitGroups(); - if (BitGroups.empty()) - return nullptr; + if (!HasZeros) + return Select(N, false); - // Fill in ValueRotsVec. - collectValueRotInfo(); + // We currently have two techniques for handling results with zeros: early + // masking (the default) and late masking. Late masking is sometimes more + // efficient, but because the structure of the bit groups is different, it + // is hard to tell without generating both and comparing the results. With + // late masking, we ignore zeros in the resulting value when inserting each + // set of bit groups, and then mask in the zeros at the end. With early + // masking, we only insert the non-zero parts of the result at every step. - if (Bits.size() == 32) { - return Select32(N); - } else { - assert(Bits.size() == 64 && "Not 64 bits here?"); - // TODO: The 64-bit case! 
+ unsigned InstCnt, InstCntLateMask; + DEBUG(dbgs() << "\tEarly masking:\n"); + SDNode *RN = Select(N, false, &InstCnt); + DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n"); + + DEBUG(dbgs() << "\tLate masking:\n"); + SDNode *RNLM = Select(N, true, &InstCntLateMask); + DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask << + " instructions\n"); + + if (InstCnt <= InstCntLateMask) { + DEBUG(dbgs() << "\tUsing early-masking for isel\n"); + return RN; } - return nullptr; + DEBUG(dbgs() << "\tUsing late-masking for isel\n"); + return RNLM; } }; } // anonymous namespace @@ -993,6 +1832,9 @@ SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) { N->getValueType(0) != MVT::i64) return nullptr; + if (!UseBitPermRewriter) + return nullptr; + switch (N->getOpcode()) { default: break; case ISD::ROTL: @@ -1431,77 +2273,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { default: break; case ISD::Constant: { - if (N->getValueType(0) == MVT::i64) { - // Get 64 bit value. - int64_t Imm = cast(N)->getZExtValue(); - // Assume no remaining bits. - unsigned Remainder = 0; - // Assume no shift required. - unsigned Shift = 0; - - // If it can't be represented as a 32 bit value. - if (!isInt<32>(Imm)) { - Shift = countTrailingZeros(Imm); - int64_t ImmSh = static_cast(Imm) >> Shift; - - // If the shifted value fits 32 bits. - if (isInt<32>(ImmSh)) { - // Go with the shifted value. - Imm = ImmSh; - } else { - // Still stuck with a 64 bit value. - Remainder = Imm; - Shift = 32; - Imm >>= 32; - } - } - - // Intermediate operand. - SDNode *Result; - - // Handle first 32 bits. - unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; - - // Simple value. - if (isInt<16>(Imm)) { - // Just the Lo bits. - Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); - } else if (Lo) { - // Handle the Hi bits. - unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; - Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); - // And Lo bits. - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } else { - // Just the Hi bits. - Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); - } - - // If no shift, we're done. - if (!Shift) return Result; - - // Shift for next step if the upper 32-bits were not zero. - if (Imm) { - Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, - SDValue(Result, 0), - getI32Imm(Shift), - getI32Imm(63 - Shift)); - } - - // Add in the last bits as required. - if ((Hi = (Remainder >> 16) & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Hi)); - } - if ((Lo = Remainder & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } - - return Result; - } + if (N->getValueType(0) == MVT::i64) + return SelectInt64(CurDAG, N); break; } diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index bea27a34926..7d1249c2ce7 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -617,13 +617,11 @@ def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), } let hasSideEffects = 0 in { -let isCommutable = 1 in { defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA), (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64, RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; -} // Rotate instructions. 
defm RLDCL : MDSForm_1r<30, 8, diff --git a/test/CodeGen/PowerPC/bperm.ll b/test/CodeGen/PowerPC/bperm.ll index 0f920328a05..c489c1f90a8 100644 --- a/test/CodeGen/PowerPC/bperm.ll +++ b/test/CodeGen/PowerPC/bperm.ll @@ -16,6 +16,100 @@ entry: ; CHECK: blr } +define i64 @bs8(i64 %x) #0 { +entry: + %0 = tail call i64 @llvm.bswap.i64(i64 %x) + ret i64 %0 + +; CHECK-LABEL: @bs8 +; CHECK-DAG: rldicl [[REG1:[0-9]+]], 3, 16, 0 +; CHECK-DAG: rldicl [[REG2:[0-9]+]], 3, 8, 0 +; CHECK-DAG: rldicl [[REG3:[0-9]+]], 3, 24, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG1]], 8, 48 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 32, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG3]], 16, 40 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 3, 48, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG4]], 24, 32 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 56, 0 +; CHECK-DAG: rldimi [[REG2]], [[REG5]], 40, 16 +; CHECK-DAG: rldimi [[REG2]], [[REG6]], 48, 8 +; CHECK-DAG: rldimi [[REG2]], 3, 56, 0 +; CHECK: mr 3, [[REG2]] +; CHECK: blr +} + +define i64 @test1(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 8 + %and = and i64 %0, 5963776000 + ret i64 %and + +; CHECK-LABEL: @test1 +; CHECK-DAG: li [[REG1:[0-9]+]], 11375 +; CHECK-DAG: rldicl [[REG3:[0-9]+]], 4, 56, 0 +; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 19 +; CHECK: and 3, [[REG3]], [[REG2]] +; CHECK: blr +} + +define i64 @test2(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 6 + %and = and i64 %0, 133434808670355456 + ret i64 %and + +; CHECK-LABEL: @test2 +; CHECK-DAG: lis [[REG1:[0-9]+]], 474 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 58, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 3648 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32 +; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464 +; CHECK: and 3, [[REG5]], [[REG4]] +; CHECK: blr +} + +define i64 @test3(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i0, 34 + %and = and i64 %0, 191795733152661504 + ret i64 %and + +; CHECK-LABEL: @test3 +; CHECK-DAG: lis [[REG1:[0-9]+]], 170 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 34, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 22861 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 34 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + +define i64 @test4(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 15 + %and = and i64 %0, 58195968 + ret i64 %and + +; CHECK-LABEL: @test4 +; CHECK: rldicl [[REG1:[0-9]+]], 4, 49, 0 +; CHECK: andis. 
3, [[REG1]], 888 +; CHECK: blr +} + +define i64 @test5(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i1, 12 + %and = and i64 %0, 127252959854592 + ret i64 %and + +; CHECK-LABEL: @test5 +; CHECK-DAG: lis [[REG1:[0-9]+]], 3703 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 4, 12, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 35951 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + ; Function Attrs: nounwind readnone define zeroext i32 @test6(i32 zeroext %x) #0 { entry: @@ -33,8 +127,153 @@ entry: ; CHECK: blr } +define i64 @test7(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i0, 5 + %and = and i64 %0, 58195968 + ret i64 %and + +; CHECK-LABEL: @test7 +; CHECK: rlwinm [[REG1:[0-9]+]], 3, 27, 9, 12 +; CHECK: rlwimi [[REG1]], 3, 27, 6, 7 +; CHECK: mr 3, [[REG1]] +; CHECK: blr +} + +define i64 @test8(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i0, 1 + %and = and i64 %0, 169172533248 + ret i64 %and + +; CHECK-LABEL: @test8 +; CHECK-DAG: lis [[REG1:[0-9]+]], 4 +; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 63, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 60527 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19 +; CHECK: and 3, [[REG4]], [[REG3]] +; CHECK: blr +} + +define i64 @test9(i64 %i0, i64 %i1) #0 { +entry: + %0 = lshr i64 %i1, 14 + %and = and i64 %0, 18848677888 + %1 = shl i64 %i1, 51 + %and3 = and i64 %1, 405323966463344640 + %or4 = or i64 %and, %and3 + ret i64 %or4 + +; CHECK-LABEL: @test9 +; CHECK-DAG: lis [[REG1:[0-9]+]], 1440 +; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 62, 0 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 4, 50, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 4 +; CHECK-DAG: rldimi [[REG6]], [[REG5]], 53, 0 +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32 +; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464 +; CHECK: and 3, [[REG6]], [[REG4]] +; CHECK: blr +} + +define i64 @test10(i64 %i0, i64 %i1) #0 { +entry: + %0 = shl i64 %i0, 37 + %and = and i64 %0, 15881483390550016 + %1 = shl i64 %i0, 25 + %and3 = and i64 %1, 2473599172608 + %or4 = or i64 %and, %and3 + ret i64 %or4 + +; CHECK-LABEL: @test10 +; CHECK-DAG: lis [[REG1:[0-9]+]], 1 +; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 25, 0 +; CHECK-DAG: rldicl [[REG7:[0-9]+]], 3, 37, 0 +; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 8183 +; CHECK-DAG: ori [[REG3:[0-9]+]], [[REG1]], 50017 +; CHECK-DAG: sldi [[REG4:[0-9]+]], [[REG2]], 25 +; CHECK-DAG: sldi [[REG5:[0-9]+]], [[REG3]], 37 +; CHECK-DAG: and [[REG8:[0-9]+]], [[REG6]], [[REG4]] +; CHECK-DAG: and [[REG9:[0-9]+]], [[REG7]], [[REG5]] +; CHECK: or 3, [[REG9]], [[REG8]] +; CHECK: blr +} + +define i64 @test11(i64 %x) #0 { +entry: + %and = and i64 %x, 4294967295 + %shl = shl i64 %x, 32 + %or = or i64 %and, %shl + ret i64 %or + +; CHECK-LABEL: @test11 +; CHECK: rlwinm 3, 3, 0, 1, 0 +; CHECK: blr +} + +define i64 @test12(i64 %x) #0 { +entry: + %and = and i64 %x, 4294905855 + %shl = shl i64 %x, 32 + %or = or i64 %and, %shl + ret i64 %or + +; CHECK-LABEL: @test12 +; CHECK: rlwinm 3, 3, 0, 20, 15 +; CHECK: blr +} + +define i64 @test13(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %or = or i64 %and, %and1 + ret i64 %or + +; CHECK-LABEL: @test13 +; CHECK: rlwinm 3, 3, 4, 24, 31 +; CHECK: blr +} + +define i64 @test14(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %and2 = and i64 %x, -4294967296 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and + ret i64 %or3 + +; CHECK-LABEL: @test14 +; CHECK: rldicr 
[[REG1:[0-9]+]], 3, 0, 31 +; CHECK: rlwimi [[REG1]], 3, 4, 24, 31 +; CHECK: mr 3, [[REG1]] +; CHECK: blr +} + +define i64 @test15(i64 %x) #0 { +entry: + %shl = shl i64 %x, 4 + %and = and i64 %shl, 240 + %shr = lshr i64 %x, 28 + %and1 = and i64 %shr, 15 + %and2 = and i64 %x, -256 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and + ret i64 %or3 + +; CHECK-LABEL: @test15 +; CHECK: rlwimi 3, 3, 4, 24, 31 +; CHECK: blr +} + ; Function Attrs: nounwind readnone declare i32 @llvm.bswap.i32(i32) #0 +declare i64 @llvm.bswap.i64(i64) #0 attributes #0 = { nounwind readnone }
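
For reference, the counting that SelectInt64Count does can be exercised outside
of LLVM. The sketch below is illustrative only; isInt32/isInt16/ctz64 are
portable stand-ins for the LLVM helpers, and the example checks the mask from
@test1 above, which the selector materializes as "li 11375; sldi 19".

#include <cassert>
#include <cstdint>

static bool isInt32(int64_t X) { return X == (int64_t)(int32_t)X; }
static bool isInt16(int64_t X) { return X == (int64_t)(int16_t)X; }
static unsigned ctz64(uint64_t X) { return __builtin_ctzll(X); } // GCC/Clang

// Mirrors the counting logic of SelectInt64Count: li/lis, optional ori,
// optional sldi (rldicr), and optional oris/ori for the remaining low bits.
static unsigned int64MaterializationCost(int64_t Imm) {
  uint64_t Remainder = 0;
  unsigned Shift = 0;
  if (!isInt32(Imm)) {
    Shift = ctz64(Imm);
    int64_t ImmSh = (int64_t)((uint64_t)Imm >> Shift);
    if (isInt32(ImmSh)) {
      Imm = ImmSh;
    } else {
      Remainder = Imm;
      Shift = 32;
      Imm >>= 32;
    }
  }
  unsigned Result = 0;
  unsigned Lo = Imm & 0xFFFF;
  if (isInt16(Imm))
    ++Result;                          // li
  else if (Lo)
    Result += 2;                       // lis + ori
  else
    ++Result;                          // lis
  if (!Shift)
    return Result;
  if (Imm)
    ++Result;                          // rldicr (sldi)
  if ((Remainder >> 16) & 0xFFFF)
    ++Result;                          // oris
  if (Remainder & 0xFFFF)
    ++Result;                          // ori
  return Result;
}

int main() {
  assert((11375LL << 19) == 5963776000LL);           // the @test1 mask
  assert(int64MaterializationCost(5963776000LL) == 2); // li + sldi
  return 0;
}
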
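
The two-instruction fallback in SelectRotMask64/SelectRotMaskIns64 relies on
64-bit rotations composing additively modulo 64: an "inverse" rotation by
(64 + RLAmt - MaskStart) % 64 followed by a rotation by MaskStart restores the
requested amount. A quick stand-alone check of that identity (illustrative
only):

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

int main() {
  uint64_t X = 0x0123456789ABCDEFULL;
  for (unsigned RLAmt = 0; RLAmt < 64; ++RLAmt)
    for (unsigned MaskStart = 0; MaskStart < 64; ++MaskStart) {
      unsigned RLAmt1 = (64 + RLAmt - MaskStart) % 64;
      unsigned RLAmt2 = MaskStart;
      assert(rotl64(rotl64(X, RLAmt1), RLAmt2) == rotl64(X, RLAmt));
    }
  return 0;
}
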
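
The PPCInstr64Bit.td hunk above drops isCommutable from RLDIMI: the tied
insert-target register and the rotated source play different roles, so swapping
them changes the result. A stand-alone model of the instruction (illustrative;
the mask helper follows the ISA's big-endian bit numbering, bit 0 = MSB) makes
that concrete:

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t X, unsigned N) {
  N &= 63;
  return N ? (X << N) | (X >> (64 - N)) : X;
}

// MASK(MB, ME) with wrap-around when MB > ME.
static uint64_t ppcMask(unsigned MB, unsigned ME) {
  uint64_t FromMB = ~0ULL >> MB;          // ones from bit MB down to bit 63
  uint64_t ToME = ~0ULL << (63 - ME);     // ones from bit 0 down to bit ME
  return MB <= ME ? (FromMB & ToME) : (FromMB | ToME);
}

// rldimi RA,RS,SH,MB: rotate RS left by SH and insert it into RA under
// MASK(MB, 63-SH); RA is both a source and the destination.
static uint64_t rldimi(uint64_t RA, uint64_t RS, unsigned SH, unsigned MB) {
  uint64_t M = ppcMask(MB, 63 - SH);
  return (rotl64(RS, SH) & M) | (RA & ~M);
}

int main() {
  // The two register operands are not interchangeable.
  uint64_t A = 0x1122334455667788ULL, B = 0xCAFEBABEDEADBEEFULL;
  assert(rldimi(A, B, 8, 48) != rldimi(B, A, 8, 48));
  return 0;
}
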