R600: Support schedule and packetization of trans-only inst

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185268 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-12 07:37:34 +00:00 · 2013-06-29 19:32:43 +00:00 · 2013-06-29 19:32:43 +00:00 · 8f9fbd67c3
commit 8f9fbd67c3
parent 7d1a0d4e3e
13 changed files with 267 additions and 111 deletions
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@ -250,8 +250,9 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {

 std::vector<std::pair<int, unsigned> >
 R600InstrInfo::ExtractSrcs(MachineInstr *MI,
-                           const DenseMap<unsigned, unsigned> &PV)
-    const {
+                           const DenseMap<unsigned, unsigned> &PV,
+                           unsigned &ConstCount) const {
+  ConstCount = 0;
  const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
  const std::pair<int, unsigned> DummyPair(-1, 0);
  std::vector<std::pair<int, unsigned> > Result;
@ -259,18 +260,20 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
  for (unsigned n = Srcs.size(); i < n; ++i) {
    unsigned Reg = Srcs[i].first->getReg();
    unsigned Index = RI.getEncodingValue(Reg) & 0xff;
-    unsigned Chan = RI.getHWRegChan(Reg);
    if (Reg == AMDGPU::OQAP) {
      Result.push_back(std::pair<int, unsigned>(Index, 0));
    }
-    if (Index > 127) {
-      Result.push_back(DummyPair);
-      continue;
-    }
    if (PV.find(Reg) != PV.end()) {
+      // 255 is used to tells its a PS/PV reg
+      Result.push_back(std::pair<int, unsigned>(255, 0));
+      continue;
+    }
+    if (Index > 127) {
+      ConstCount++;
      Result.push_back(DummyPair);
      continue;
    }
+    unsigned Chan = RI.getHWRegChan(Reg);
    Result.push_back(std::pair<int, unsigned>(Index, Chan));
  }
  for (; i < 3; ++i)
@ -305,23 +308,51 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
  return Src;
 }

-bool
-R600InstrInfo::isLegal(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-             unsigned CheckedSize) const {
+static unsigned
+getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210: {
+    unsigned Cycles[3] = { 2, 1, 0};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_021_SCL_122: {
+    unsigned Cycles[3] = { 1, 2, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_120_SCL_212: {
+    unsigned Cycles[3] = { 2, 1, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_102_SCL_221: {
+    unsigned Cycles[3] = { 2, 2, 1};
+    return Cycles[Op];
+  }
+  default:
+    llvm_unreachable("Wrong Swizzle for Trans Slot");
+    return 0;
+  }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned  R600InstrInfo::isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
  int Vector[4][3];
  memset(Vector, -1, sizeof(Vector));
-  for (unsigned i = 0; i < CheckedSize; i++) {
+  for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
    const std::vector<std::pair<int, unsigned> > &Srcs =
        Swizzle(IGSrcs[i], Swz[i]);
    for (unsigned j = 0; j < 3; j++) {
      const std::pair<int, unsigned> &Src = Srcs[j];
-      if (Src.first < 0)
+      if (Src.first < 0 || Src.first == 255)
        continue;
      if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
-        if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
-            Swz[i] != R600InstrInfo::ALU_VEC_021) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
            // The value from output queue A (denoted by register OQAP) can
            // only be fetched during the first cycle.
            return false;
@ -332,51 +363,126 @@ R600InstrInfo::isLegal(
      if (Vector[Src.second][j] < 0)
        Vector[Src.second][j] = Src.first;
      if (Vector[Src.second][j] != Src.first)
-        return false;
+        return i;
    }
  }
+  // Now check Trans Alu
+  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransSrcs[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (Src.first == 255)
+      continue;
+    if (Vector[Src.second][Cycle] < 0)
+      Vector[Src.second][Cycle] = Src.first;
+    if (Vector[Src.second][Cycle] != Src.first)
+      return IGSrcs.size() - 1;
+  }
+  return IGSrcs.size();
+}
+
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    unsigned Idx) {
+  assert(Idx < SwzCandidate.size());
+  int ResetIdx = Idx;
+  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+    ResetIdx --;
+  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+  }
+  if (ResetIdx == -1)
+    return false;
+  SwzCandidate[ResetIdx]++;
+  return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  unsigned ValidUpTo = 0;
+  do {
+    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+    if (ValidUpTo == IGSrcs.size())
+      return true;
+  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
+  return false;
+}
+
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                  const std::vector<std::pair<int, unsigned> > &TransOps,
+                  unsigned ConstCount) {
+  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransOps[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (ConstCount > 0 && Cycle == 0)
+      return false;
+    if (ConstCount > 1 && Cycle == 1)
+      return false;
+  }
  return true;
 }

-bool
-R600InstrInfo::recursiveFitsFPLimitation(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-             unsigned Depth) const {
-  if (!isLegal(IGSrcs, SwzCandidate, Depth))
-    return false;
-  if (IGSrcs.size() == Depth)
-    return true;
-  unsigned i = SwzCandidate[Depth];
-  for (; i < 6; i++) {
-    SwzCandidate[Depth] = (R600InstrInfo::BankSwizzle) i;
-    if (recursiveFitsFPLimitation(IGSrcs, SwzCandidate, Depth + 1))
-      return true;
-  }
-  SwzCandidate[Depth] = R600InstrInfo::ALU_VEC_012;
-  return false;
-}
-
 bool
 R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
-                                      const DenseMap<unsigned, unsigned> &PV,
-                                      std::vector<BankSwizzle> &ValidSwizzle)
+                                       const DenseMap<unsigned, unsigned> &PV,
+                                       std::vector<BankSwizzle> &ValidSwizzle,
+                                       bool isLastAluTrans)
    const {
  //Todo : support shared src0 - src1 operand

  std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
  ValidSwizzle.clear();
+  unsigned ConstCount;
+  BankSwizzle TransBS;
  for (unsigned i = 0, e = IG.size(); i < e; ++i) {
-    IGSrcs.push_back(ExtractSrcs(IG[i], PV));
+    IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
    unsigned Op = getOperandIdx(IG[i]->getOpcode(),
        AMDGPU::OpName::bank_swizzle);
    ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
        IG[i]->getOperand(Op).getImm());
  }
-  bool Result = recursiveFitsFPLimitation(IGSrcs, ValidSwizzle);
-  if (!Result)
-    return false;
-  return true;
+  std::vector<std::pair<int, unsigned> > TransOps;
+  if (!isLastAluTrans)
+    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+  TransOps = IGSrcs.back();
+  IGSrcs.pop_back();
+  ValidSwizzle.pop_back();
+
+  static const R600InstrInfo::BankSwizzle TransSwz[] = {
+    ALU_VEC_012_SCL_210,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221
+  };
+  for (unsigned i = 0; i < 4; i++) {
+    TransBS = TransSwz[i];
+    if (!isConstCompatible(TransBS, TransOps, ConstCount))
+      continue;
+    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+        TransBS);
+    if (Result) {
+      ValidSwizzle.push_back(TransBS);
+      return true;
+    }
+  }
+
+  return false;
 }


@ -406,7 +512,8 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
 }

 bool
-R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
  std::vector<unsigned> Consts;
  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
    MachineInstr *MI = MIs[i];
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@ -84,26 +84,38 @@ namespace llvm {
  SmallVector<std::pair<MachineOperand *, int64_t>, 3>
      getSrcs(MachineInstr *MI) const;

-  bool isLegal(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-             unsigned CheckedSize) const;
-  bool recursiveFitsFPLimitation(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-             unsigned Depth = 0) const;
+  unsigned  isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  bool FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;

  /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
  /// returns true and the first (in lexical order) BankSwizzle affectation
  /// starting from the one already provided in the Instruction Group MIs that
  /// fits Read Port limitations in BS if available. Otherwise returns false
  /// and undefined content in BS.
+  /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+  /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+  /// apply to the last instruction.
  /// PV holds GPR to PV registers in the Instruction Group MIs.
  bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
                               const DenseMap<unsigned, unsigned> &PV,
-                               std::vector<BankSwizzle> &BS) const;
+                               std::vector<BankSwizzle> &BS,
+                               bool isLastAluTrans) const;
+
+  /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+  /// from KCache bank on R700+. This function check if MI set in input meet
+  /// this limitations
+  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+  /// Same but using const index set instead of MI set.
  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
-  bool canBundle(const std::vector<MachineInstr *> &) const;

  /// \breif Vector instructions are instructions that must fill all
  /// instruction slots within an instruction group.
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@ -1489,6 +1489,8 @@ let hasSideEffects = 1 in {

  def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
    let Pattern = [];
+    let TransOnly = 0;
+    let Itinerary = AnyALU;
  }

  def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@ -32,7 +32,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
-  OccupedSlotsMask = 15;
+  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;

@ -160,7 +160,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
-      OccupedSlotsMask = 15;
+      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }
@ -251,6 +251,9 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
 R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

+  if (TII->isTransOnly(MI))
+    return AluTrans;
+
    switch (MI->getOpcode()) {
    case AMDGPU::PRED_X:
      return AluPredX;
@ -346,7 +349,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
-    if (TII->canBundle(InstructionsGroupCandidate)) {
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
@ -421,7 +424,8 @@ unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
-      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
+      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+      AvailableAlus[AluPredX].size();
 }

 SUnit* R600SchedStrategy::pickAlu() {
@ -429,20 +433,27 @@ SUnit* R600SchedStrategy::pickAlu() {
    if (!OccupedSlotsMask) {
      // Bottom up scheduling : predX must comes first
      if (!AvailableAlus[AluPredX].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX]);
      }
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded]);
      }
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW]);
      }
    }
+    bool TransSlotOccuped = OccupedSlotsMask & 16;
+    if (!TransSlotOccuped) {
+      if (!AvailableAlus[AluTrans].empty()) {
+        OccupedSlotsMask |= 16;
+        return PopInst(AvailableAlus[AluTrans]);
+      }
+    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@ -46,6 +46,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
    AluT_W,
    AluT_XYZW,
    AluPredX,
+    AluTrans,
    AluDiscarded, // LLVM Instructions that are going to be eliminated
    AluLast
  };
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@ -77,8 +77,6 @@ private:
    do {
      if (TII->isPredicated(BI))
        continue;
-      if (TII->isTransOnly(BI))
-        continue;
      int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
      if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
        continue;
@ -87,6 +85,10 @@ private:
        continue;
      }
      unsigned Dst = BI->getOperand(DstIdx).getReg();
+      if (TII->isTransOnly(BI)) {
+        Result[Dst] = AMDGPU::PS;
+        continue;
+      }
      if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
          BI->getOpcode() == AMDGPU::DOT4_eg) {
        Result[Dst] = AMDGPU::PV_X;
@ -157,10 +159,6 @@ public:
      return true;
    if (!TII->isALUInstr(MI->getOpcode()))
      return true;
-    if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY)
-      return true;
-    if (TII->isTransOnly(MI))
-      return true;
    if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
      return true;
    return false;
@ -170,7 +168,7 @@ public:
  // together.
  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
    MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
-    if (getSlot(MII) <= getSlot(MIJ))
+    if (getSlot(MII) <= getSlot(MIJ) && !TII->isTransOnly(MII))
      return false;
    // Does MII and MIJ share the same pred_sel ?
    int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
@ -204,11 +202,16 @@ public:
    MI->getOperand(LastOp).setImm(Bit);
  }

-  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+  bool isBundlableWithCurrentPMI(MachineInstr *MI,
+                                 const DenseMap<unsigned, unsigned> &PV,
+                                 std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                 bool &isTransSlot) {
+    isTransSlot = TII->isTransOnly(MI);
+
+    // Are the Constants limitations met ?
    CurrentPacketMIs.push_back(MI);
-    bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
-    DEBUG(
-      if (!FitsConstLimits) {
+    if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+      DEBUG(
        dbgs() << "Couldn't pack :\n";
        MI->dump();
        dbgs() << "with the following packets :\n";
@ -217,14 +220,15 @@ public:
          dbgs() << "\n";
        }
        dbgs() << "because of Consts read limitations\n";
-      });
-    const DenseMap<unsigned, unsigned> &PV =
-        getPreviousVector(CurrentPacketMIs.front());
-    std::vector<R600InstrInfo::BankSwizzle> BS;
-    bool FitsReadPortLimits =
-        TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS);
-    DEBUG(
-      if (!FitsReadPortLimits) {
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // Is there a BankSwizzle set that meet Read Port limitations ?
+    if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+            PV, BS, isTransSlot)) {
+      DEBUG(
        dbgs() << "Couldn't pack :\n";
        MI->dump();
        dbgs() << "with the following packets :\n";
@ -233,25 +237,43 @@ public:
          dbgs() << "\n";
        }
        dbgs() << "because of Read port limitations\n";
-      });
-    bool isBundlable = FitsConstLimits && FitsReadPortLimits;
-    if (isBundlable) {
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    CurrentPacketMIs.pop_back();
+    return true;
+  }
+
+  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+    MachineBasicBlock::iterator FirstInBundle =
+        CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+    const DenseMap<unsigned, unsigned> &PV =
+        getPreviousVector(FirstInBundle);
+    std::vector<R600InstrInfo::BankSwizzle> BS;
+    bool isTransSlot;
+
+    if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
      for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
        MachineInstr *MI = CurrentPacketMIs[i];
-            unsigned Op = TII->getOperandIdx(MI->getOpcode(),
-                AMDGPU::OpName::bank_swizzle);
-            MI->getOperand(Op).setImm(BS[i]);
+        unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+            AMDGPU::OpName::bank_swizzle);
+        MI->getOperand(Op).setImm(BS[i]);
      }
+      unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+          AMDGPU::OpName::bank_swizzle);
+      MI->getOperand(Op).setImm(BS.back());
+      if (!CurrentPacketMIs.empty())
+        setIsLastBit(CurrentPacketMIs.back(), 0);
+      substitutePV(MI, PV);
+      MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+      if (isTransSlot) {
+        endPacket(llvm::next(It)->getParent(), llvm::next(It));
+      }
+      return It;
    }
-    CurrentPacketMIs.pop_back();
-    if (!isBundlable) {
-      endPacket(MI->getParent(), MI);
-      substitutePV(MI, getPreviousVector(MI));
-      return VLIWPacketizerList::addToPacket(MI);
-    }
-    if (!CurrentPacketMIs.empty())
-      setIsLastBit(CurrentPacketMIs.back(), 0);
-    substitutePV(MI, PV);
+    endPacket(MI->getParent(), MI);
    return VLIWPacketizerList::addToPacket(MI);
  }
 };
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@ -96,6 +96,7 @@ def PV_X : R600RegWithChan<"PV.X", 254, "X">;
 def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
 def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
 def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
 def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
 def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}

 define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

 ; CHECK: @fp_to_sint_v4i32
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
  %value = load <4 x float> addrspace(1) * %in
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

 ; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
  %value = load <4 x float> addrspace(1) * %in
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

-;CHECK: COS * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

 define void @test() {
   %r0 = call float @llvm.R600.load.input(i32 0)
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

 ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

 define void @test() {
   %r0 = call float @llvm.R600.load.input(i32 0)
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

-;CHECK: SIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

 define void @test() {
   %r0 = call float @llvm.R600.load.input(i32 0)