Fix memory access lowering on SPU, adding

support for the case where alignment < value size. These cases were silently miscompiled before this patch. Now they are overly verbose (especially stores), and any front-end should still avoid misaligned memory accesses as much as possible. The bit-juggling algorithm added here probably still has some room for improvement. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@118889 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-26 05:25:47 +00:00 · 2010-11-12 10:14:03 +00:00
parent d0c82a683e
commit 7ea1ab5f41
7 changed files with 277 additions and 115 deletions
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -42,41 +42,12 @@ using namespace llvm;
 namespace {
  std::map<unsigned, const char *> node_names;
-  //! EVT mapping to useful data for Cell SPU
+  // Byte offset of the preferred slot (counted from the MSB)
-  struct valtype_map_s {
+  int prefslotOffset(EVT VT) {
-    EVT   valtype;
+    int retval=0;
-    int   prefslot_byte;
+    if (VT==MVT::i1) retval=3; 
-  };
+    if (VT==MVT::i8) retval=3; 
-
+    if (VT==MVT::i16) retval=2; 
  const valtype_map_s valtype_map[] = {
    { MVT::i1,   3 },
    { MVT::i8,   3 },
    { MVT::i16,  2 },
    { MVT::i32,  0 },
    { MVT::f32,  0 },
    { MVT::i64,  0 },
    { MVT::f64,  0 },
    { MVT::i128, 0 }
  };
  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
  const valtype_map_s *getValueTypeMapEntry(EVT VT) {
    const valtype_map_s *retval = 0;
    for (size_t i = 0; i < n_valtype_map; ++i) {
      if (valtype_map[i].valtype == VT) {
        retval = valtype_map + i;
        break;
      }
    }
 #ifndef NDEBUG
    if (retval == 0) {
      report_fatal_error("getValueTypeMapEntry returns NULL for " +
                         Twine(VT.getEVTString()));
    }
 #endif
    return retval;
  }
@@ -440,9 +411,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
-    setOperationAction(ISD::LOAD,    VT, Legal);
+    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
-    setOperationAction(ISD::STORE,   VT, Legal);
+    setOperationAction(ISD::STORE,   VT, Custom);
    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
@@ -503,8 +474,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
    node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
    node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
    node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
-    node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
+    node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
-    node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
+    node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
    node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
@@ -573,11 +544,26 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  EVT OutVT = Op.getValueType();
  ISD::LoadExtType ExtType = LN->getExtensionType();
  unsigned alignment = LN->getAlignment();
-  const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
+  int pso = prefslotOffset(InVT);
  DebugLoc dl = Op.getDebugLoc();
  EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
                                                  (128 / InVT.getSizeInBits()));
  // two sanity checks
  assert( LN->getAddressingMode() == ISD::UNINDEXED  
          && "we should get only UNINDEXED adresses");
  // clean aligned loads can be selected as-is
  if (InVT.getSizeInBits() == 128 && alignment == 16)
    return SDValue();
  // Get pointerinfos to the memory chunk(s) that contain the data to load 
  uint64_t mpi_offset = LN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr( LN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr( LN->getPointerInfo().V, mpi_offset+16);
  switch (LN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    SDValue result;
    SDValue basePtr = LN->getBasePtr();
    SDValue rotate;
@@ -591,7 +577,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
          && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
        // Known offset into basePtr
        int64_t offset = CN->getSExtValue();
-        int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte);
+        int64_t rotamt = int64_t((offset & 0xf) - pso);
        if (rotamt < 0)
          rotamt += 16;
@@ -611,14 +597,14 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                     && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
        // Plain aligned a-form address: rotate into preferred slot
        // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
-        int64_t rotamt = -vtm->prefslot_byte;
+        int64_t rotamt = -pso;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getConstant(rotamt, MVT::i16);
      } else {
        // Offset the rotate amount by the basePtr and the preferred slot
        // byte offset
-        int64_t rotamt = -vtm->prefslot_byte;
+        int64_t rotamt = -pso;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -658,20 +644,23 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      // byte offset
      rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                           basePtr,
-                           DAG.getConstant(-vtm->prefslot_byte, PtrVT));
+                           DAG.getConstant(-pso, PtrVT));
    }
-    // Re-emit as a v16i8 vector load
+    // Do the load as a i128 to allow possible shifting
-    result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
+    SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
-                         LN->getPointerInfo(),
+                         lowMemPtr,
                         LN->isVolatile(), LN->isNonTemporal(), 16);
-
+ 
  // When the size is not greater than alignment we get all data with just
  // one load
  if (alignment >= InVT.getSizeInBits()/8) {
    // Update the chain
-    the_chain = result.getValue(1);
+    the_chain = low.getValue(1);
    // Rotate into the preferred slot:
-    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8,
+    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
-                         result.getValue(0), rotate);
+                         low.getValue(0), rotate);
    // Convert the loaded v16i8 vector to the appropriate vector type
    // specified by the operand:
@@ -679,7 +668,56 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                 InVT, (128 / InVT.getSizeInBits()));
    result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
                         DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
  }
  // When alignment is less than the size, we might need (known only at
  // run-time) two loads
  // TODO: if the memory address is composed only from constants, we have 
  // extra knowledge, and might avoid the second load
  else {
    // storage position offset from lower 16 byte aligned memory chunk
    SDValue offset = DAG.getNode( ISD::AND, dl, MVT::i32, 
                                  basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    // 16 - offset
    SDValue offset_compl = DAG.getNode( ISD::SUB, dl, MVT::i32, 
                                        DAG.getConstant( 16, MVT::i32),
                                        offset );
    // get a registerfull of ones. (this implementation is a workaround: LLVM 
    // cannot handle 128 bit signed int constants)
    SDValue ones = DAG.getConstant( -1, MVT::v4i32 );
    ones = DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, ones);
    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT, 
                                           basePtr,
                                           DAG.getConstant(16, PtrVT)),
                               highMemPtr,
                               LN->isVolatile(), LN->isNonTemporal(), 16);
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              high.getValue(1));
    // Shift the (possible) high part right to compensate for the misalignment.
    // if there is no highpart (i.e. value is i64 and offset is 4), this 
    // will zero out the high value.
    high = DAG.getNode( SPUISD::SRL_BYTES, dl, MVT::i128, high, 
                                     DAG.getNode( ISD::SUB, dl, MVT::i32,
                                                 DAG.getConstant( 16, MVT::i32),
                                                 offset
                                                ));
    // Shift the low part similarly
    // TODO: add SPUISD::SHL_BYTES
    low = DAG.getNode( SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
    // Merge the two parts
    result = DAG.getNode( ISD::BIT_CONVERT, dl, vecVT,
                          DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
    if (!InVT.isVector()) {
      result = DAG.getNode( SPUISD::VEC2PREFSLOT, dl, InVT, result );
     }
  }
    // Handle extending loads by extending the scalar result:
    if (ExtType == ISD::SEXTLOAD) {
      result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
@@ -703,21 +741,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
                         retops, sizeof(retops) / sizeof(retops[0]));
    return result;
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    {
      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
                         "than UNINDEXED\n" +
                         Twine((unsigned)LN->getAddressingMode()));
      /*NOTREACHED*/
    }
  }
  return SDValue();
 }
 /// Custom lower stores for CellSPU
@@ -735,12 +758,24 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned alignment = SN->getAlignment();
  SDValue result;
  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
                                                 (128 / StVT.getSizeInBits()));
  // Get pointerinfos to the memory chunk(s) that will hold the data to store
  uint64_t mpi_offset = SN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr( SN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr( SN->getPointerInfo().V, mpi_offset+16);
  // two sanity checks
  assert( SN->getAddressingMode() == ISD::UNINDEXED  
          && "we should get only UNINDEXED adresses");
  // clean aligned stores can be selected as-is
  if (StVT.getSizeInBits() == 128 && alignment == 16)
    return SDValue();
  switch (SN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    // The vector type we really want to load from the 16-byte chunk.
    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                                 VT, (128 / VT.getSizeInBits()));
    SDValue alignLoadVec;
    SDValue basePtr = SN->getBasePtr();
@@ -811,17 +846,17 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                  DAG.getConstant(0, PtrVT));
    }
-    // Load the memory to which to store.
+    // Load the lower part of the memory to which to store.
-    alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr,
+    SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
-                               SN->getPointerInfo(),
+                              lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
                               SN->isVolatile(), SN->isNonTemporal(), 16);
  // if we don't need to store over the 16 byte boundary, one store suffices
  if (alignment >= StVT.getSizeInBits()/8) {
    // Update the chain
-    the_chain = alignLoadVec.getValue(1);
+    the_chain = low.getValue(1);
-    LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
+    LoadSDNode *LN = cast<LoadSDNode>(low);
    SDValue theValue = SN->getValue();
    SDValue result;
    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
@@ -849,14 +884,14 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                      theValue);
    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
-                         vectorizeOp, alignLoadVec,
+                         vectorizeOp, low,
                         DAG.getNode(ISD::BIT_CONVERT, dl,
                                     MVT::v4i32, insertEltOp));
    result = DAG.getStore(the_chain, dl, result, basePtr,
-                          LN->getPointerInfo(),
+                          lowMemPtr,
                          LN->isVolatile(), LN->isNonTemporal(),
-                          LN->getAlignment());
+                          16);
 #if 0 && !defined(NDEBUG)
    if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
@@ -869,24 +904,106 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      DAG.setRoot(currentRoot);
    }
 #endif
    return result;
    /*UNREACHED*/
  }
-  case ISD::PRE_INC:
+  // do the store when it might cross the 16 byte memory access boundary.
-  case ISD::PRE_DEC:
+  else {
-  case ISD::POST_INC:
+    // TODO issue a warning if SN->isVolatile()== true? This is likely not 
-  case ISD::POST_DEC:
+    // what the user wanted.
-  case ISD::LAST_INDEXED_MODE:
+    
-    {
+    // address offset from nearest lower 16-byte aligned address
-      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
+    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, 
-                         "than UNINDEXED\n" +
+                                    SN->getBasePtr(), 
-                         Twine((unsigned)SN->getAddressingMode()));
+                                    DAG.getConstant(0xf, MVT::i32));
-      /*NOTREACHED*/
+    // 16 - offset
    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, 
                                           DAG.getConstant( 16, MVT::i32),
                                           offset);
    SDValue hi_shift = DAG.getNode(ISD::SUB, dl, MVT::i32, 
                                      DAG.getConstant( VT.getSizeInBits()/8,
                                                       MVT::i32),
                                      offset_compl);
    // 16 - sizeof(Value)
    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, 
                                     DAG.getConstant( 16, MVT::i32),
                                     DAG.getConstant( VT.getSizeInBits()/8,
                                                      MVT::i32));
    // get a registerfull of ones 
    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    ones = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, ones);
    // Create the 128 bit masks that have ones where the data to store is
    // located.
    SDValue lowmask, himask; 
    // if the value to store doesn't fill up an entire 128 bits, zero
    // out the last bits of the mask so that only the value we want to store
    // is masked. 
    // this is e.g. in the case of store i32, align 2
    if (!VT.isVector()){
      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, 
                                                               surplus);
      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
    }
-  }
+    else {
      lowmask = ones;
      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
    }
    // this will zero, if there are no data that goes to the high quad 
    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, 
                                                            offset_compl);
    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, 
                                                             offset);
    // Load in the old data and zero out the parts that will be overwritten with
    // the new data to store.
    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, 
                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                           DAG.getConstant( 16, PtrVT)),
                               highMemPtr,
                               SN->isVolatile(), SN->isNonTemporal(), 16);
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              hi.getValue(1));
    low = DAG.getNode(ISD::AND, dl, MVT::i128, 
                        DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, low),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    hi = DAG.getNode(ISD::AND, dl, MVT::i128, 
                        DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, hi),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
    // Shift the Value to store into place. rlow contains the parts that go to
    // the lower memory chunk, rhi has the parts that go to the upper one. 
    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, 
                                                            offset_compl);
    // Merge the old data and the new data and store the results
    // Need to convert vectors here to integer as 'OR'ing floats assert 
    rlow = DAG.getNode(ISD::OR, dl, MVT::i128, 
                          DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, low),
                          DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rlow));
    rhi = DAG.getNode(ISD::OR, dl, MVT::i128, 
                         DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, hi),
                         DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rhi));
    low = DAG.getStore(the_chain, dl, rlow, basePtr,
                          lowMemPtr,
                          SN->isVolatile(), SN->isNonTemporal(), 16);
    hi  = DAG.getStore(the_chain, dl, rhi, 
                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                        DAG.getConstant( 16, PtrVT)),
                            highMemPtr,
                            SN->isVolatile(), SN->isNonTemporal(), 16);
    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
                                                           hi.getValue(0));
  } 
  return result;
  return SDValue();
 }
 //! Generate the address of a constant pool entry.
@@ -2002,7 +2119,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
                        DAG.getConstant(scaleShift, MVT::i32));
    }
-    vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt);
+    vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
    // Replicate the bytes starting at byte 0 across the entire vector (for
    // consistency with the notion of a unified register set)
@@ -2911,8 +3028,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    }
    break;
  }
-  case SPUISD::SHLQUAD_L_BITS:
+  case SPUISD::SHL_BITS:
-  case SPUISD::SHLQUAD_L_BYTES:
+  case SPUISD::SHL_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -41,8 +41,9 @@ namespace llvm {
      CNTB,                     ///< Count leading ones in bytes
      PREFSLOT2VEC,             ///< Promote scalar->vector
      VEC2PREFSLOT,             ///< Extract element 0
-      SHLQUAD_L_BITS,           ///< Rotate quad left, by bits
+      SHL_BITS,                 ///< Shift quad left, by bits
-      SHLQUAD_L_BYTES,          ///< Rotate quad left, by bytes
+      SHL_BYTES,                ///< Shift quad left, by bytes
      SRL_BYTES,                ///< Shift quad right, by bytes. Insert zeros.
      VEC_ROTL,                 ///< Vector rotate left
      VEC_ROTR,                 ///< Vector rotate right
      ROTBYTES_LEFT,            ///< Rotate bytes (loads -> ROTQBYI)
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -2369,10 +2369,13 @@ class ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>:
    RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB",
           RotateShift, pattern>;
-class ROTQBYVecInst<ValueType vectype>:
+class ROTQBYGenInst<ValueType type, RegisterClass rc>:
-    ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+    ROTQBYInst<(outs rc:$rT), (ins rc:$rA, R32C:$rB),
-               [(set (vectype VECREG:$rT),
+               [(set (type rc:$rT),
-                     (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>;
+                     (SPUrotbytes_left (type rc:$rA), R32C:$rB))]>;
 class ROTQBYVecInst<ValueType type>:
    ROTQBYGenInst<type, VECREG>;
 multiclass RotateQuadLeftByBytes
 {
@@ -2382,6 +2385,7 @@ multiclass RotateQuadLeftByBytes
  def v4f32: ROTQBYVecInst<v4f32>;
  def v2i64: ROTQBYVecInst<v2i64>;
  def v2f64: ROTQBYVecInst<v2f64>;
  def i128:  ROTQBYGenInst<i128, GPRC>;
 }
 defm ROTQBY: RotateQuadLeftByBytes;
@@ -2394,10 +2398,13 @@ class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>:
    RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val",
            RotateShift, pattern>;
 class ROTQBYIGenInst<ValueType type, RegisterClass rclass>:
    ROTQBYIInst<(outs rclass:$rT), (ins rclass:$rA, u7imm:$val),
                [(set (type rclass:$rT),
                      (SPUrotbytes_left (type rclass:$rA), (i16 uimm7:$val)))]>;
 class ROTQBYIVecInst<ValueType vectype>:
-    ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+    ROTQBYIGenInst<vectype, VECREG>;
                [(set (vectype VECREG:$rT),
                      (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>;
 multiclass RotateQuadByBytesImm
 {
@@ -2407,6 +2414,7 @@ multiclass RotateQuadByBytesImm
  def v4f32: ROTQBYIVecInst<v4f32>;
  def v2i64: ROTQBYIVecInst<v2i64>;
  def vfi64: ROTQBYIVecInst<v2f64>;
  def i128:  ROTQBYIGenInst<i128, GPRC>;
 }
 defm ROTQBYI: RotateQuadByBytesImm;
@@ -2661,6 +2669,10 @@ multiclass RotateQuadBytes
 defm ROTQMBY : RotateQuadBytes;
 def : Pat<(SPUsrl_bytes GPRC:$rA, R32C:$rB),
          (ROTQMBYr128  GPRC:$rA, 
                        (SFIr32 R32C:$rB, 0))>;
 class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>:
    RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val",
            RotateShift, pattern>;
@@ -2749,6 +2761,11 @@ multiclass RotateMaskQuadByBits
 defm ROTQMBI: RotateMaskQuadByBits;
 def : Pat<(srl GPRC:$rA, R32C:$rB),
          (ROTQMBIr128  GPRC:$rA, 
                        (SFIr32 R32C:$rB, 0))>;
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // Rotate quad and mask by bits, immediate
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
--- a/lib/Target/CellSPU/SPUNodes.td
+++ b/lib/Target/CellSPU/SPUNodes.td
@@ -83,10 +83,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;
 // SPUISelLowering.h):
 def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>;
 // Shift left quadword by bits and bytes
 def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
 def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
 // Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only):
 def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>;
 def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>;
@@ -105,6 +101,12 @@ def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT",
 def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS",
                                   SPUvecshift_type>;
 // Shift entire quad left by bytes/bits. Zeros are shifted in on the right
 // SHL_BITS the same as SHL for i128, but ISD::SHL is not implemented for i128
 def SPUshlquad_l_bytes: SDNode<"SPUISD::SHL_BYTES", SPUvecshift_type, []>;
 def SPUshlquad_l_bits: SDNode<"SPUISD::SHL_BITS", SPUvecshift_type, []>;
 def SPUsrl_bytes: SDNode<"SPUISD::SRL_BYTES", SPUvecshift_type, []>;
 // SPU form select mask for bytes, immediate
 def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>;
--- a/test/CodeGen/CellSPU/arg_ret.ll
+++ b/test/CodeGen/CellSPU/arg_ret.ll
@@ -26,7 +26,7 @@ define ccc i32 @test_regs_and_stack( %paramstruct %prm, i32 %stackprm )
 define ccc %paramstruct @test_return( i32 %param,  %paramstruct %prm )
 {
-;CHECK:  lqd	$75, 80($sp)
+;CHECK:  lqd	{{\$[0-9]+}}, 80($sp)
 ;CHECK-NOT:	ori	{{\$[0-9]+, \$[0-9]+, 0}}
 ;CHECK:  lr    $3, $4
  ret %paramstruct %prm
--- a/test/CodeGen/CellSPU/loads.ll
+++ b/test/CodeGen/CellSPU/loads.ll
@@ -38,3 +38,15 @@ define <4 x float> @load_undef(){
 	%val = load <4 x float>* undef
 	ret <4 x float> %val
 }
 ;check that 'misaligned' loads that may span two memory chunks
 ;have two loads. Don't check for the bitmanipulation, as that 
 ;might change with improved algorithms or scheduling 
 define i32 @load_misaligned( i32* %ptr ){
 ;CHECK: load_misaligned
 ;CHECK: lqd
 ;CHECK: lqd
 ;CHECK: bi $lr
  %rv = load i32* %ptr, align 2
  ret i32 %rv
 }
--- a/test/CodeGen/CellSPU/stores.ll
+++ b/test/CodeGen/CellSPU/stores.ll
@@ -14,6 +14,7 @@
 ; RUN: grep iohl                %t1.s | count 8
 ; RUN: grep shufb               %t1.s | count 15
 ; RUN: grep frds                %t1.s | count 1
 ; RUN: llc < %s -march=cellspu | FileCheck %s
 ; ModuleID = 'stores.bc'
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
@@ -149,3 +150,15 @@ entry:
 	store float %conv, float* %dest
 	ret float %conv
 }
 ;Check stores that might span two 16 byte memory blocks
 define void @store_misaligned( i32 %val, i32* %ptr) {	
 ;CHECK: store_misaligned
 ;CHECK: lqd
 ;CHECK: lqd
 ;CHECK: stqd
 ;CHECK: stqd
 ;CHECK: bi $lr
 	store i32 %val, i32*%ptr, align 2
 	ret void
 }