diff --git a/lib/Target/CellSPU/SPU.td b/lib/Target/CellSPU/SPU.td
index a5db1d9d2b5..8327fe03d7f 100644
--- a/lib/Target/CellSPU/SPU.td
+++ b/lib/Target/CellSPU/SPU.td
@@ -15,6 +15,13 @@
 //
 include "llvm/Target/Target.td"
 
+// Holder for a dag code fragment, so that a fragment can be passed around as
+// a class parameter (you'd think this'd already be in a td file... :-)
+
+class CodeFrag<dag frag> {
+  dag Fragment = frag;
+}
+
 //===----------------------------------------------------------------------===//
 // Register File Description
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
index 6d679bac724..4159133770d 100644
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -1,8 +1,17 @@
+//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====//
+//
+//                     Cell SPU 64-bit operations
+//
+// Primary author: Scott Michel (scottm@aero.org)
+//===----------------------------------------------------------------------===//
+
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // 64-bit comparisons:
 //
 // 1. The instruction sequences for vector vice scalar differ by a
-//    constant.
+//    constant. In the scalar case, we're only interested in the
+//    top two 32-bit slots, whereas we're interested in an exact
+//    all-four-slot match in the vector case.
 //
 // 2. There are no "immediate" forms, since loading 64-bit constants
 //    could be a constant pool load.
@@ -10,10 +19,10 @@
 // 3. i64 setcc results are i32, which are subsequently converted to a FSM
 //    mask when used in a select pattern.
 //
-// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask
-//    (TODO)
+// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO)
+//    [Note: this may be moot, since gb produces v4i32 or r32.]
 //
-// M00$E Kan be Pretty N@sTi!!!!! (appologies to Monty!)
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!)
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 
 // selb instruction definition for i64. Note that the selection mask is
@@ -22,17 +31,15 @@ def SELBr64_cond:
    SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
             [/* no pattern */]>;
 
-class CodeFrag<dag frag> {
-  dag Fragment = frag;
-}
-
-class I64SELECTNegCond<PatFrag cond, CodeFrag cmpare>:
+// select the negative condition:
+class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
   Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
-      (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 cmpare.Fragment))>;
+      (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>;
 
-class I64SETCCNegCond<PatFrag cond, CodeFrag cmpare>:
+// setcc the negative condition:
+class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
   Pat<(cond R64C:$rA, R64C:$rB),
-      (XORIr32 cmpare.Fragment, -1)>;
+      (XORIr32 compare.Fragment, -1)>;
 
 // The i64 seteq fragment that does the scalar->vector conversion and
 // comparison:
@@ -64,14 +71,13 @@ multiclass CompareEqual64 {
 defm I64EQ: CompareEqual64;
 
 def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>;
+def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>;
 
-def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
-          I64EQv2i64.Fragment>;
-
-def I64Select:
-    Pat<(select R32C:$rC, R64C:$rB, R64C:$rA),
-        (SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>;
+def : Pat<(select R32C:$rC, R64C:$rB, R64C:$rA),
+          (SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>;
 
+// i64 setne:
 def : I64SETCCNegCond<setne, I64EQr64>;
+def : I64SELECTNegCond<setne, I64EQr64>;
 
-def : I64SELECTNegCond<setne, I64EQr64>;
\ No newline at end of file
+// i64 setugt:
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index f51aba2fda6..76b22843696 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -149,7 +149,7 @@ namespace {
   }
 
   bool
-  isHighLow(const SDValue &Op) 
+  isHighLow(const SDValue &Op)
   {
     return (Op.getOpcode() == SPUISD::IndirectAddr
             && ((Op.getOperand(0).getOpcode() == SPUISD::Hi
@@ -229,14 +229,14 @@ public:
     TM(tm),
     SPUtli(*tm.getTargetLowering())
   {}
-    
+
   virtual bool runOnFunction(Function &Fn) {
     // Make sure we re-emit a set of the global base reg if necessary
     GlobalBaseReg = 0;
     SelectionDAGISel::runOnFunction(Fn);
     return true;
   }
-   
+
   /// getI32Imm - Return a target constant with the specified value, of type
   /// i32.
   inline SDValue getI32Imm(uint32_t Imm) {
@@ -248,7 +248,7 @@ public:
   inline SDValue getI64Imm(uint64_t Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i64);
   }
-    
+
   /// getSmallIPtrImm - Return a target constant of pointer type.
   inline SDValue getSmallIPtrImm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
@@ -258,6 +258,15 @@ public:
   /// target-specific node if it hasn't already been changed.
   SDNode *Select(SDValue Op);
 
+  //! Emit the instruction sequence for i64 shl
+  SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);
+
+  //! Emit the instruction sequence for i64 srl
+  SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);
+
+  //! Emit the instruction sequence for i64 sra
+  SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);
+
   //! Returns true if the address N is an A-form (local store) address
   bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
                        SDValue &Index);
@@ -287,7 +296,7 @@ public:
     switch (ConstraintCode) {
     default: return true;
     case 'm':   // memory
-      if (!SelectDFormAddr(Op, Op, Op0, Op1) 
+      if (!SelectDFormAddr(Op, Op, Op0, Op1)
           && !SelectAFormAddr(Op, Op, Op0, Op1))
         SelectXFormAddr(Op, Op, Op0, Op1);
       break;
@@ -306,7 +315,7 @@ public:
 #endif
       break;
     }
-      
+
     OutOps.push_back(Op0);
     OutOps.push_back(Op1);
     return false;
@@ -318,14 +327,14 @@ public:
 
   virtual const char *getPassName() const {
     return "Cell SPU DAG->DAG Pattern Instruction Selection";
-  } 
-    
+  }
+
   /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
   /// this target when scheduling the DAG.
   virtual HazardRecognizer *CreateTargetHazardRecognizer() {
     const TargetInstrInfo *II = TM.getInstrInfo();
     assert(II && "No InstrInfo?");
-    return new SPUHazardRecognizer(*II); 
+    return new SPUHazardRecognizer(*II);
   }
 
   // Include the pieces autogenerated from the target description.
@@ -375,7 +384,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
     abort();
     /*NOTREACHED*/
 
-  case SPUISD::AFormAddr: 
+  case SPUISD::AFormAddr:
     // Just load from memory if there's only a single use of the location,
     // otherwise, this will get handled below with D-form offset addresses
     if (N.hasOneUse()) {
@@ -404,7 +413,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
   return false;
 }
 
-bool 
+bool
 SPUDAGToDAGISel::SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
                                   SDValue &Base) {
   const int minDForm2Offset = -(1 << 7);
@@ -527,7 +536,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
         ConstantSDNode *CN = cast<ConstantSDNode>(Op0);
         offset = int32_t(CN->getSExtValue());
         idxOp = Op1;
-      } 
+      }
 
       if (offset >= minOffset && offset <= maxOffset) {
         Base = CurDAG->getTargetConstant(offset, PtrTy);
@@ -622,27 +631,20 @@ SPUDAGToDAGISel::Select(SDValue Op) {
   if (N->isMachineOpcode()) {
     return NULL;   // Already selected.
   } else if (Opc == ISD::FrameIndex) {
-    // Selects to (add $sp, FI * stackSlotSize)
-    int FI =
-      SPUFrameInfo::FItoStackOffset(cast<FrameIndexSDNode>(N)->getIndex());
-    MVT PtrVT = SPUtli.getPointerTy();
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
+    SDValue Imm0 = CurDAG->getTargetConstant(0, Op.getValueType());
 
-    // Adjust stack slot to actual offset in frame:
-    if (isS10Constant(FI)) {
-      DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with AIr32 $sp, "
-                 << FI
-                 << "\n");
+    if (FI < 128) {
       NewOpc = SPU::AIr32;
-      Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT);
-      Ops[1] = CurDAG->getTargetConstant(FI, PtrVT);
+      Ops[0] = TFI;
+      Ops[1] = Imm0;
       n_ops = 2;
     } else {
-      DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with Ar32 $sp, "
-                 << FI
-                 << "\n");
       NewOpc = SPU::Ar32;
-      Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT);
-      Ops[1] = CurDAG->getConstant(FI, PtrVT);
+      Ops[0] = CurDAG->getRegister(SPU::R1, Op.getValueType());
+      Ops[1] = SDValue(CurDAG->getTargetNode(SPU::ILAr32, Op.getValueType(),
+                                             TFI, Imm0), 0);
       n_ops = 2;
     }
   } else if (Opc == ISD::ZERO_EXTEND) {
@@ -661,6 +663,18 @@ SPUDAGToDAGISel::Select(SDValue Op) {
         n_ops = 2;
       }
     }
+  } else if (Opc == ISD::SHL) {
+    if (OpVT == MVT::i64) {
+      return SelectSHLi64(Op, OpVT);
+    }
+  } else if (Opc == ISD::SRL) {
+    if (OpVT == MVT::i64) {
+      return SelectSRLi64(Op, OpVT);
+    }
+  } else if (Opc == ISD::SRA) {
+    if (OpVT == MVT::i64) {
+      return SelectSRAi64(Op, OpVT);
+    }
   } else if (Opc == SPUISD::LDRESULT) {
     // Custom select instructions for LDRESULT
     MVT VT = N->getValueType(0);
@@ -713,7 +727,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
       n_ops = 2;
     }
   }
-  
+
   if (n_ops > 0) {
     if (N->hasOneUse())
       return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops);
@@ -723,7 +737,213 @@ SPUDAGToDAGISel::Select(SDValue Op) {
     return SelectCode(Op);
 }
 
-/// createPPCISelDag - This pass converts a legalized DAG into a 
+/*!
+ * Emit the instruction sequence for i64 left shifts. The basic algorithm
+ * is to fill the bottom two word slots with zeros so that zeros are shifted
+ * in as the entire quadword is shifted left.
+ *
+ * \note This code could also be used to implement v2i64 shl.
+ * NOTE(review): a constant shift of 0 leaves Shift null; confirm unreachable.
+ *
+ * @param Op The shl node to select (operand 0: value, operand 1: shift amount)
+ * @param OpVT Op's machine value type (redundant, passed for convenience)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
+  SDValue Op0 = Op.getOperand(0);
+  MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = Op.getOperand(1);
+  MVT ShiftAmtVT = ShiftAmt.getValueType();
+  SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0;
+  SDValue SelMaskVal;
+
+  VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0);
+  SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16);
+  SelMask = CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT, SelMaskVal);
+  ZeroFill = CurDAG->getTargetNode(SPU::ILv2i64, VecVT,
+                                   CurDAG->getTargetConstant(0, OpVT));
+  VecOp0 = CurDAG->getTargetNode(SPU::SELBv2i64, VecVT,
+                                 SDValue(ZeroFill, 0),
+                                 SDValue(VecOp0, 0),
+                                 SDValue(SelMask, 0));
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;  // whole bytes
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;  // remaining bits
+
+    if (bytes > 0) {
+      Shift =
+        CurDAG->getTargetNode(SPU::SHLQBYIv2i64, VecVT,
+                              SDValue(VecOp0, 0),
+                              CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      Shift =
+        CurDAG->getTargetNode(SPU::SHLQBIIv2i64, VecVT,
+                              SDValue((Shift != 0 ? Shift : VecOp0), 0),
+                              CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+  } else {
+    SDNode *Bytes =
+      CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT,
+                            ShiftAmt,
+                            CurDAG->getTargetConstant(3, ShiftAmtVT));
+    SDNode *Bits =
+      CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT,
+                            ShiftAmt,
+                            CurDAG->getTargetConstant(7, ShiftAmtVT));
+    Shift =
+      CurDAG->getTargetNode(SPU::SHLQBYv2i64, VecVT,
+                            SDValue(VecOp0, 0), SDValue(Bytes, 0));
+    Shift =
+      CurDAG->getTargetNode(SPU::SHLQBIv2i64, VecVT,
+                            SDValue(Shift, 0), SDValue(Bits, 0));
+  }
+
+  return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
+}
+
+/*!
+ * Emit the instruction sequence for i64 logical right shifts.
+ * NOTE(review): a constant shift of 0 leaves Shift null; confirm unreachable.
+ *
+ * @param Op The srl node to select (operand 0: value, operand 1: shift amount)
+ * @param OpVT Op's machine value type (redundant, passed for convenience)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
+  SDValue Op0 = Op.getOperand(0);
+  MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = Op.getOperand(1);
+  MVT ShiftAmtVT = ShiftAmt.getValueType();
+  SDNode *VecOp0, *Shift = 0;
+
+  VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0);
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;  // whole bytes
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;  // remaining bits
+
+    if (bytes > 0) {
+      Shift =
+        CurDAG->getTargetNode(SPU::ROTQMBYIv2i64, VecVT,
+                              SDValue(VecOp0, 0),
+                              CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      Shift =
+        CurDAG->getTargetNode(SPU::ROTQMBIIv2i64, VecVT,
+                              SDValue((Shift != 0 ? Shift : VecOp0), 0),
+                              CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+  } else {
+    SDNode *Bytes =
+      CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT,
+                            ShiftAmt,
+                            CurDAG->getTargetConstant(3, ShiftAmtVT));
+    SDNode *Bits =
+      CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT,
+                            ShiftAmt,
+                            CurDAG->getTargetConstant(7, ShiftAmtVT));
+
+    // Ensure that the shift amounts are negated!
+    Bytes = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
+                                  SDValue(Bytes, 0),
+                                  CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Bits = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
+                                 SDValue(Bits, 0),
+                                 CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Shift =
+      CurDAG->getTargetNode(SPU::ROTQMBYv2i64, VecVT,
+                            SDValue(VecOp0, 0), SDValue(Bytes, 0));
+    Shift =
+      CurDAG->getTargetNode(SPU::ROTQMBIv2i64, VecVT,
+                            SDValue(Shift, 0), SDValue(Bits, 0));
+  }
+
+  return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
+}
+
+/*!
+ * Emit the instruction sequence for i64 arithmetic right shifts.
+ * NOTE(review): a constant shift of 0 leaves Shift null; confirm unreachable.
+ *
+ * @param Op The sra node to select (operand 0: value, operand 1: shift amount)
+ * @param OpVT Op's machine value type (redundant, passed for convenience)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
+  // Promote Op0 to vector
+  MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = Op.getOperand(1);
+  MVT ShiftAmtVT = ShiftAmt.getValueType();
+
+  SDNode *VecOp0 =
+    CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op.getOperand(0));
+
+  SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT);
+  SDNode *SignRot =
+    CurDAG->getTargetNode(SPU::ROTMAIv2i64_i32, MVT::v2i64,
+                          SDValue(VecOp0, 0), SignRotAmt);
+  SDNode *UpperHalfSign =
+    CurDAG->getTargetNode(SPU::ORi32_v4i32, MVT::i32, SDValue(SignRot, 0));
+
+  SDNode *UpperHalfSignMask =
+    CurDAG->getTargetNode(SPU::FSM64r32, VecVT, SDValue(UpperHalfSign, 0));
+  SDNode *UpperLowerMask =
+    CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT,
+                          CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
+  SDNode *UpperLowerSelect =
+    CurDAG->getTargetNode(SPU::SELBv2i64, VecVT,
+                          SDValue(UpperHalfSignMask, 0),
+                          SDValue(VecOp0, 0),
+                          SDValue(UpperLowerMask, 0));
+
+  SDNode *Shift = 0;
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;  // whole bytes
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;  // remaining bits
+
+    if (bytes > 0) {
+      bytes = 31 - bytes;
+      Shift =
+        CurDAG->getTargetNode(SPU::ROTQBYIv2i64, VecVT,
+                              SDValue(UpperLowerSelect, 0),
+                              CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      bits = 8 - bits;
+      Shift =
+        CurDAG->getTargetNode(SPU::ROTQBIIv2i64, VecVT,
+                              SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
+                              CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+  } else {
+    SDNode *NegShift =
+      CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
+                            ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Shift =
+      CurDAG->getTargetNode(SPU::ROTQBYBIv2i64_r32, VecVT,
+                            SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
+    Shift =
+      CurDAG->getTargetNode(SPU::ROTQBIv2i64, VecVT,
+                            SDValue(Shift, 0), SDValue(NegShift, 0));
+  }
+
+  return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
+}
+
+/// createSPUISelDag - This pass converts a legalized DAG into a
 /// SPU-specific DAG, ready for instruction scheduling.
 ///
 FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) {
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 0822181d3b7..5ccfd14aa4b 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -204,10 +204,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
 
-  // SPU needs custom lowering for shift left/right for i64
-  setOperationAction(ISD::SHL,  MVT::i64,    Custom);
-  setOperationAction(ISD::SRL,  MVT::i64,    Custom);
-  setOperationAction(ISD::SRA,  MVT::i64,    Custom);
+  // Make these operations legal and handle them during instruction selection:
+  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
+  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
+  setOperationAction(ISD::SRA,  MVT::i64,    Legal);
 
   // Custom lower i8, i32 and i64 multiplications
   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
@@ -215,6 +215,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::MUL,  MVT::i64,    Expand);   // libcall
 
   // Need to custom handle (some) common i8, i64 math ops
+  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
   setOperationAction(ISD::ADD,  MVT::i64,    Custom);
   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
   setOperationAction(ISD::SUB,  MVT::i64,    Custom);
@@ -249,7 +250,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   // Zero extension and sign extension for i64 have to be
   // custom legalized
   setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
   setOperationAction(ISD::ANY_EXTEND,  MVT::i64, Custom);
 
   // Custom lower i128 -> i64 truncates
@@ -262,7 +262,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 
   // FDIV on SPU requires custom lowering
-  setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // libcall
 
   // SPU has [U|S]INT_TO_FP
@@ -340,7 +339,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
     setOperationAction(ISD::ADD , VT, Legal);
     setOperationAction(ISD::SUB , VT, Legal);
     // mul has to be custom lowered.
-    setOperationAction(ISD::MUL , VT, Custom);
+    // TODO: v2i64 vector multiply
+    setOperationAction(ISD::MUL , VT, Legal);
 
     setOperationAction(ISD::AND   , VT, Legal);
     setOperationAction(ISD::OR    , VT, Legal);
@@ -354,7 +354,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
     setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::UDIV, VT, Expand);
     setOperationAction(ISD::UREM, VT, Expand);
-    setOperationAction(ISD::FDIV, VT, Custom);
 
     // Custom lower build_vector, constant pool spills, insert and
     // extract vector elements:
@@ -371,9 +370,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
 
-  // FIXME: This is only temporary until I put all vector multiplications in
-  // SPUInstrInfo.td:
-  setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 
   setShiftAmountType(MVT::i32);
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
@@ -411,10 +408,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
-    node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY";
-    node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU";
-    node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH";
-    node_names[(unsigned) SPUISD::MPYHH] = "SPUISD::MPYHH";
     node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
     node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
     node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
@@ -422,21 +415,12 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
     node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
-    node_names[(unsigned) SPUISD::ROTQUAD_RZ_BYTES] =
-      "SPUISD::ROTQUAD_RZ_BYTES";
-    node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] =
-      "SPUISD::ROTQUAD_RZ_BITS";
-    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
-    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
-      "SPUISD::ROTBYTES_LEFT_BITS";
     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
     node_names[(unsigned) SPUISD::ADD_EXTENDED] = "SPUISD::ADD_EXTENDED";
     node_names[(unsigned) SPUISD::CARRY_GENERATE] = "SPUISD::CARRY_GENERATE";
     node_names[(unsigned) SPUISD::SUB_EXTENDED] = "SPUISD::SUB_EXTENDED";
     node_names[(unsigned) SPUISD::BORROW_GENERATE] = "SPUISD::BORROW_GENERATE";
-    node_names[(unsigned) SPUISD::FPInterp] = "SPUISD::FPInterp";
-    node_names[(unsigned) SPUISD::FPRecipEst] = "SPUISD::FPRecipEst";
     node_names[(unsigned) SPUISD::SEXT32TO64] = "SPUISD::SEXT32TO64";
   }
 
@@ -1922,182 +1906,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
-static SDValue LowerVectorMUL(SDValue Op, SelectionDAG &DAG) {
-  switch (Op.getValueType().getSimpleVT()) {
-  default:
-    cerr << "CellSPU: Unknown vector multiplication, got "
-         << Op.getValueType().getMVTString()
-         << "\n";
-    abort();
-    /*NOTREACHED*/
-
-  case MVT::v4i32:
-	  break;
-
-  // Multiply two v8i16 vectors (pipeline friendly version):
-  // a) multiply lower halves, mask off upper 16-bit of 32-bit product
-  // b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes)
-  // c) Use SELB to select upper and lower halves from the intermediate results
-  //
-  // NOTE: We really want to move the SELECT_MASK to earlier to actually get the
-  // dual-issue. This code does manage to do this, even if it's a little on
-  // the wacky side
-  case MVT::v8i16: {
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineRegisterInfo &RegInfo = MF.getRegInfo();
-    SDValue Chain = Op.getOperand(0);
-    SDValue rA = Op.getOperand(0);
-    SDValue rB = Op.getOperand(1);
-    unsigned FSMBIreg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
-    unsigned HiProdReg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
-
-    SDValue FSMBOp =
-      DAG.getCopyToReg(Chain, FSMBIreg,
-                       DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
-                                   DAG.getConstant(0xcccc, MVT::i16)));
-
-    SDValue HHProd =
-      DAG.getCopyToReg(FSMBOp, HiProdReg,
-                       DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB));
-
-    SDValue HHProd_v4i32 =
-      DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
-                  DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32));
-
-    return DAG.getNode(SPUISD::SELB, MVT::v8i16,
-                       DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB),
-                       DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
-                                   DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32,
-                                               HHProd_v4i32,
-                                               DAG.getConstant(16, MVT::i16))),
-                       DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32));
-  }
-
-  // This M00sE is N@stI! (apologies to Monty Python)
-  //
-  // SPU doesn't know how to do any 8-bit multiplication, so the solution
-  // is to break it all apart, sign extend, and reassemble the various
-  // intermediate products.
-  case MVT::v16i8: {
-    SDValue rA = Op.getOperand(0);
-    SDValue rB = Op.getOperand(1);
-    SDValue c8 = DAG.getConstant(8, MVT::i32);
-    SDValue c16 = DAG.getConstant(16, MVT::i32);
-
-    SDValue LLProd =
-      DAG.getNode(SPUISD::MPY, MVT::v8i16,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA),
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB));
-
-    SDValue rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8);
-
-    SDValue rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8);
-
-    SDValue LHProd =
-      DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16,
-                  DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8);
-
-    SDValue FSMBmask = DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
-                                     DAG.getConstant(0x2222, MVT::i16));
-
-    SDValue LoProdParts =
-      DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
-                  DAG.getNode(SPUISD::SELB, MVT::v8i16,
-                              LLProd, LHProd, FSMBmask));
-
-    SDValue LoProdMask = DAG.getConstant(0xffff, MVT::i32);
-
-    SDValue LoProd =
-      DAG.getNode(ISD::AND, MVT::v4i32,
-                  LoProdParts,
-                  DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                              LoProdMask, LoProdMask,
-                              LoProdMask, LoProdMask));
-
-    SDValue rAH =
-      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16);
-
-    SDValue rBH =
-      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16);
-
-    SDValue HLProd =
-      DAG.getNode(SPUISD::MPY, MVT::v8i16,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH),
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH));
-
-    SDValue HHProd_1 =
-      DAG.getNode(SPUISD::MPY, MVT::v8i16,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
-                              DAG.getNode(SPUISD::VEC_SRA,
-                                          MVT::v4i32, rAH, c8)),
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
-                              DAG.getNode(SPUISD::VEC_SRA,
-                                          MVT::v4i32, rBH, c8)));
-
-    SDValue HHProd =
-      DAG.getNode(SPUISD::SELB, MVT::v8i16,
-                  HLProd,
-                  DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8),
-                  FSMBmask);
-
-    SDValue HiProd =
-      DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, HHProd, c16);
-
-    return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8,
-                       DAG.getNode(ISD::OR, MVT::v4i32,
-                                   LoProd, HiProd));
-  }
-  }
-
-  return SDValue();
-}
-
-static SDValue LowerFDIVf32(SDValue Op, SelectionDAG &DAG) {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineRegisterInfo &RegInfo = MF.getRegInfo();
-
-  SDValue A = Op.getOperand(0);
-  SDValue B = Op.getOperand(1);
-  MVT VT = Op.getValueType();
-
-  unsigned VRegBR, VRegC;
-
-  if (VT == MVT::f32) {
-    VRegBR = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
-    VRegC = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
-  } else {
-    VRegBR = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
-    VRegC = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
-  }
-  // TODO: make sure we're feeding FPInterp the right arguments
-  // Right now: fi B, frest(B)
-
-  // Computes BRcpl =
-  // (Floating Interpolate (FP Reciprocal Estimate B))
-  SDValue BRcpl =
-      DAG.getCopyToReg(DAG.getEntryNode(), VRegBR,
-                       DAG.getNode(SPUISD::FPInterp, VT, B,
-                                DAG.getNode(SPUISD::FPRecipEst, VT, B)));
-
-  // Computes A * BRcpl and stores in a temporary register
-  SDValue AxBRcpl =
-      DAG.getCopyToReg(BRcpl, VRegC,
-                 DAG.getNode(ISD::FMUL, VT, A,
-                        DAG.getCopyFromReg(BRcpl, VRegBR, VT)));
-  // What's the Chain variable do? It's magic!
-  // TODO: set Chain = Op(0).getEntryNode()
-
-  return DAG.getNode(ISD::FADD, VT,
-                DAG.getCopyFromReg(AxBRcpl, VRegC, VT),
-                DAG.getNode(ISD::FMUL, VT,
-                        DAG.getCopyFromReg(AxBRcpl, VRegBR, VT),
-                        DAG.getNode(ISD::FSUB, VT, A,
-                            DAG.getNode(ISD::FMUL, VT, B,
-                            DAG.getCopyFromReg(AxBRcpl, VRegC, VT)))));
-}
-
 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getValueType();
   SDValue N = Op.getOperand(0);
@@ -2296,18 +2104,23 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
     assert(0 && "Unhandled i8 math operator");
     /*NOTREACHED*/
     break;
+  case ISD::ADD: {
+    // 8-bit addition: Promote the arguments up to 16-bits and truncate
+    // the result:
+    SDValue N1 = Op.getOperand(1);
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
+    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
+                       DAG.getNode(Opc, MVT::i16, N0, N1));
+
+  }
+
   case ISD::SUB: {
     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
     // the result:
     SDValue N1 = Op.getOperand(1);
-    N0 = (N0.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
-          : DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(),
-                            MVT::i16));
-    N1 = (N1.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1)
-          : DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(),
-                            MVT::i16));
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                        DAG.getNode(Opc, MVT::i16, N0, N1));
   }
@@ -2397,7 +2210,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
 
   switch (Opc) {
   case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     MVT Op0VT = Op0.getValueType();
     MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
@@ -2410,39 +2222,16 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
     SDValue PromoteScalar =
             DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0);
 
-    if (Opc != ISD::SIGN_EXTEND) {
-      // Use a shuffle to zero extend the i32 to i64 directly:
-      SDValue shufMask =
-              DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT,
-                          DAG.getConstant(0x80808080, MVT::i32),
-                          DAG.getConstant(0x00010203, MVT::i32),
-                          DAG.getConstant(0x80808080, MVT::i32),
-                          DAG.getConstant(0x08090a0b, MVT::i32));
-      SDValue zextShuffle =
-              DAG.getNode(SPUISD::SHUFB, Op0VecVT,
-                          PromoteScalar, PromoteScalar, shufMask);
+    // Use a shuffle to zero extend the i32 to i64 directly:
+    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT,
+        DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant(0x00010203,
+            MVT::i32), DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant(
+            0x08090a0b, MVT::i32));
+    SDValue zextShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, PromoteScalar,
+        PromoteScalar, shufMask);
 
-      return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
-                         DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle));
-    } else {
-      // SPU has no "rotate quadword and replicate bit 0" (i.e. rotate/shift
-      // right and propagate the sign bit) instruction.
-      SDValue RotQuad =
-              DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, Op0VecVT,
-                          PromoteScalar, DAG.getConstant(4, MVT::i32));
-      SDValue SignQuad =
-              DAG.getNode(SPUISD::VEC_SRA, Op0VecVT,
-                          PromoteScalar, DAG.getConstant(32, MVT::i32));
-      SDValue SelMask =
-              DAG.getNode(SPUISD::SELECT_MASK, Op0VecVT,
-                          DAG.getConstant(0xf0f0, MVT::i16));
-      SDValue CombineQuad =
-              DAG.getNode(SPUISD::SELB, Op0VecVT,
-                          SignQuad, RotQuad, SelMask);
-
-      return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
-                         DAG.getNode(ISD::BIT_CONVERT, VecVT, CombineQuad));
-    }
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, DAG.getNode(ISD::BIT_CONVERT,
+        VecVT, zextShuffle));
   }
 
   case ISD::ADD: {
@@ -2502,88 +2291,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
                        DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64,
                                    Op0, Op1, ShiftedBorrow));
   }
-
-  case ISD::SHL: {
-    SDValue ShiftAmt = Op.getOperand(1);
-    MVT ShiftAmtVT = ShiftAmt.getValueType();
-    SDValue Op0Vec = DAG.getNode(SPUISD::PREFSLOT2VEC, VecVT, Op0);
-    SDValue MaskLower =
-      DAG.getNode(SPUISD::SELB, VecVT,
-                  Op0Vec,
-                  DAG.getConstant(0, VecVT),
-                  DAG.getNode(SPUISD::SELECT_MASK, VecVT,
-                              DAG.getConstant(0xff00ULL, MVT::i16)));
-    SDValue ShiftAmtBytes =
-      DAG.getNode(ISD::SRL, ShiftAmtVT,
-                  ShiftAmt,
-                  DAG.getConstant(3, ShiftAmtVT));
-    SDValue ShiftAmtBits =
-      DAG.getNode(ISD::AND, ShiftAmtVT,
-                  ShiftAmt,
-                  DAG.getConstant(7, ShiftAmtVT));
-
-    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
-                       DAG.getNode(SPUISD::SHLQUAD_L_BITS, VecVT,
-                                   DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT,
-                                               MaskLower, ShiftAmtBytes),
-                                   ShiftAmtBits));
-  }
-
-  case ISD::SRL: {
-    MVT VT = Op.getValueType();
-    SDValue ShiftAmt = Op.getOperand(1);
-    MVT ShiftAmtVT = ShiftAmt.getValueType();
-    SDValue ShiftAmtBytes =
-      DAG.getNode(ISD::SRL, ShiftAmtVT,
-                  ShiftAmt,
-                  DAG.getConstant(3, ShiftAmtVT));
-    SDValue ShiftAmtBits =
-      DAG.getNode(ISD::AND, ShiftAmtVT,
-                  ShiftAmt,
-                  DAG.getConstant(7, ShiftAmtVT));
-
-    return DAG.getNode(SPUISD::ROTQUAD_RZ_BITS, VT,
-                       DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, VT,
-                                   Op0, ShiftAmtBytes),
-                       ShiftAmtBits);
-  }
-
-  case ISD::SRA: {
-    // Promote Op0 to vector
-    SDValue Op0 =
-      DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0));
-    SDValue ShiftAmt = Op.getOperand(1);
-    MVT ShiftVT = ShiftAmt.getValueType();
-
-    // Negate variable shift amounts
-    if (!isa<ConstantSDNode>(ShiftAmt)) {
-      ShiftAmt = DAG.getNode(ISD::SUB, ShiftVT,
-                             DAG.getConstant(0, ShiftVT), ShiftAmt);
-    }
-
-    SDValue UpperHalfSign =
-      DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i32,
-                  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
-                              DAG.getNode(SPUISD::VEC_SRA, MVT::v2i64,
-                                          Op0, DAG.getConstant(31, MVT::i32))));
-    SDValue UpperHalfSignMask =
-      DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, UpperHalfSign);
-    SDValue UpperLowerMask =
-      DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64,
-                  DAG.getConstant(0xff00, MVT::i16));
-    SDValue UpperLowerSelect =
-      DAG.getNode(SPUISD::SELB, MVT::v2i64,
-                  UpperHalfSignMask, Op0, UpperLowerMask);
-    SDValue RotateLeftBytes =
-      DAG.getNode(SPUISD::ROTBYTES_LEFT_BITS, MVT::v2i64,
-                  UpperLowerSelect, ShiftAmt);
-    SDValue RotateLeftBits =
-      DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v2i64,
-                  RotateLeftBytes, ShiftAmt);
-
-    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
-                       RotateLeftBits);
-  }
   }
 
   return SDValue();
@@ -2890,10 +2597,11 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
     return LowerRET(Op, DAG, getTargetMachine());
 
 
-  // i8, i64 math ops:
   case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND:
+    return LowerI64Math(Op, DAG, Opc);
+
+  // i8, i64 math ops:
   case ISD::ADD:
   case ISD::SUB:
   case ISD::ROTR:
@@ -2928,22 +2636,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
 
   // Vector and i8 multiply:
   case ISD::MUL:
-    if (VT.isVector())
-      return LowerVectorMUL(Op, DAG);
-    else if (VT == MVT::i8)
+    if (VT == MVT::i8)
       return LowerI8Math(Op, DAG, Opc, *this);
 
-  case ISD::FDIV:
-    if (VT == MVT::f32 || VT == MVT::v4f32)
-      return LowerFDIVf32(Op, DAG);
-#if 0
-    // This is probably a libcall
-    else if (Op.getValueType() == MVT::f64)
-      return LowerFDIVf64(Op, DAG);
-#endif
-    else
-      assert(0 && "Calling FDIV on unsupported MVT");
-
   case ISD::CTPOP:
     return LowerCTPOP(Op, DAG);
 
@@ -3119,8 +2814,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   case SPUISD::VEC_SHL:
   case SPUISD::VEC_SRL:
   case SPUISD::VEC_SRA:
-  case SPUISD::ROTQUAD_RZ_BYTES:
-  case SPUISD::ROTQUAD_RZ_BITS:
   case SPUISD::ROTBYTES_LEFT: {
     SDValue Op1 = N->getOperand(1);
 
@@ -3268,10 +2961,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   }
 
 #if 0
-  case MPY:
-  case MPYU:
-  case MPYH:
-  case MPYHH:
   case SPUISD::SHLQUAD_L_BITS:
   case SPUISD::SHLQUAD_L_BYTES:
   case SPUISD::VEC_SHL:
@@ -3279,18 +2968,14 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   case SPUISD::VEC_SRA:
   case SPUISD::VEC_ROTL:
   case SPUISD::VEC_ROTR:
-  case SPUISD::ROTQUAD_RZ_BYTES:
-  case SPUISD::ROTQUAD_RZ_BITS:
   case SPUISD::ROTBYTES_LEFT:
   case SPUISD::SELECT_MASK:
   case SPUISD::SELB:
-  case SPUISD::FPInterp:
-  case SPUISD::FPRecipEst:
   case SPUISD::SEXT32TO64:
 #endif
   }
 }
-  
+
 unsigned
 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
                                                    unsigned Depth) const {
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index 8d2e9945455..0eed9b0cfc5 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -24,10 +24,10 @@ namespace llvm {
     enum NodeType {
       // Start the numbering where the builting ops and target ops leave off.
       FIRST_NUMBER = ISD::BUILTIN_OP_END,
-      
+
       // Pseudo instructions:
       RET_FLAG,                 ///< Return with flag, matched by bi instruction
-      
+
       Hi,                       ///< High address component (upper 16)
       Lo,                       ///< Low address component (lower 16)
       PCRelAddr,                ///< Program counter relative address
@@ -41,10 +41,6 @@ namespace llvm {
       CNTB,                     ///< Count leading ones in bytes
       PREFSLOT2VEC,             ///< Promote scalar->vector
       VEC2PREFSLOT,             ///< Extract element 0
-      MPY,                      ///< 16-bit Multiply (low parts of a 32-bit)
-      MPYU,                     ///< Multiply Unsigned
-      MPYH,                     ///< Multiply High
-      MPYHH,                    ///< Multiply High-High
       SHLQUAD_L_BITS,           ///< Rotate quad left, by bits
       SHLQUAD_L_BYTES,          ///< Rotate quad left, by bytes
       VEC_SHL,                  ///< Vector shift left
@@ -52,8 +48,6 @@ namespace llvm {
       VEC_SRA,                  ///< Vector shift right (arithmetic)
       VEC_ROTL,                 ///< Vector rotate left
       VEC_ROTR,                 ///< Vector rotate right
-      ROTQUAD_RZ_BYTES,         ///< Rotate quad right, by bytes, zero fill
-      ROTQUAD_RZ_BITS,          ///< Rotate quad right, by bits, zero fill
       ROTBYTES_LEFT,            ///< Rotate bytes (loads -> ROTQBYI)
       ROTBYTES_LEFT_BITS,       ///< Rotate bytes left by bit shift count
       SELECT_MASK,              ///< Select Mask (FSM, FSMB, FSMH, FSMBI)
@@ -63,8 +57,6 @@ namespace llvm {
       CARRY_GENERATE,           ///< Carry generate for ADD_EXTENDED
       SUB_EXTENDED,             ///< Subtract extended, with borrow
       BORROW_GENERATE,          ///< Borrow generate for SUB_EXTENDED
-      FPInterp,                 ///< Floating point interpolate
-      FPRecipEst,               ///< Floating point reciprocal estimate
       SEXT32TO64,               ///< Sign-extended 32-bit const -> 64-bits
       LAST_SPUISD               ///< Last user-defined instruction
     };
@@ -87,7 +79,7 @@ namespace llvm {
   }
 
   class SPUTargetMachine;            // forward dec'l.
-  
+
   class SPUTargetLowering :
     public TargetLowering
   {
@@ -97,14 +89,14 @@ namespace llvm {
 
   public:
     SPUTargetLowering(SPUTargetMachine &TM);
-    
+
     /// getTargetNodeName() - This method returns the name of a target specific
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - Return the ValueType for ISD::SETCC
     virtual MVT getSetCCResultType(const SDValue &) const;
-    
+
     //! Custom lowering hooks
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
 
@@ -116,7 +108,7 @@ namespace llvm {
 
     virtual void computeMaskedBitsForTargetNode(const SDValue Op,
                                                 const APInt &Mask,
-                                                APInt &KnownZero, 
+                                                APInt &KnownZero,
                                                 APInt &KnownOne,
                                                 const SelectionDAG &DAG,
                                                 unsigned Depth = 0) const;
@@ -126,12 +118,12 @@ namespace llvm {
 
     ConstraintType getConstraintType(const std::string &ConstraintLetter) const;
 
-    std::pair<unsigned, const TargetRegisterClass*> 
+    std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    MVT VT) const;
 
     void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
-                                      bool hasMemory, 
+                                      bool hasMemory,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const;
 
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index 37a58705795..3c8165fbbd7 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -82,7 +82,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
   case SPU::ORIi8i32:
   case SPU::AHIvec:
   case SPU::AHIr16:
-  case SPU::AIvec:
+  case SPU::AIv4i32:
     assert(MI.getNumOperands() == 3 &&
            MI.getOperand(0).isReg() &&
            MI.getOperand(1).isReg() &&
@@ -98,8 +98,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
     assert(MI.getNumOperands() == 3 &&
            "wrong number of operands to AIr32");
     if (MI.getOperand(0).isReg() &&
-        (MI.getOperand(1).isReg() ||
-         MI.getOperand(1).isFI()) &&
+        MI.getOperand(1).isReg() &&
         (MI.getOperand(2).isImm() &&
          MI.getOperand(2).getImm() == 0)) {
       sourceReg = MI.getOperand(1).getReg();
@@ -265,7 +264,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
   // reg class to any other reg class containing R3.  This is required because
   // we instruction select bitconvert i64 -> f64 as a noop for example, so our
   // types have no specific meaning.
-  
+
   if (DestRC == SPU::R8CRegisterClass) {
     BuildMI(MBB, MI, get(SPU::ORBIr8), DestReg).addReg(SrcReg).addImm(0);
   } else if (DestRC == SPU::R16CRegisterClass) {
@@ -291,7 +290,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
     // Attempt to copy unknown/unsupported register class!
     return false;
   }
-  
+
   return true;
 }
 
@@ -464,7 +463,7 @@ SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   unsigned OpNum = Ops[0];
   unsigned Opc = MI->getOpcode();
   MachineInstr *NewMI = 0;
-  
+
   if ((Opc == SPU::ORr32
        || Opc == SPU::ORv4i32)
        && MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
@@ -508,7 +507,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 
   // Get the last instruction in the block.
   MachineInstr *LastInst = I;
-  
+
   // If there is only one terminator instruction, process it.
   if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
     if (isUncondBranch(LastInst)) {
@@ -524,7 +523,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     // Otherwise, don't know what this is.
     return true;
   }
-  
+
   // Get the instruction before it if it's a terminator.
   MachineInstr *SecondLastInst = I;
 
@@ -532,7 +531,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   if (SecondLastInst && I != MBB.begin() &&
       isUnpredicatedTerminator(--I))
     return true;
-  
+
   // If the block ends with a conditional and unconditional branch, handle it.
   if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
     TBB =  SecondLastInst->getOperand(1).getMBB();
@@ -541,7 +540,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     FBB = LastInst->getOperand(0).getMBB();
     return false;
   }
-  
+
   // If the block ends with two unconditional branches, handle it.  The second
   // one is not executed, so remove it.
   if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
@@ -554,7 +553,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   // Otherwise, can't handle this.
   return true;
 }
-    
+
 unsigned
 SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator I = MBB.end();
@@ -578,16 +577,16 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   I->eraseFromParent();
   return 2;
 }
-    
+
 unsigned
 SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
 			   MachineBasicBlock *FBB,
 			   const SmallVectorImpl<MachineOperand> &Cond) const {
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
-  assert((Cond.size() == 2 || Cond.size() == 0) && 
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
          "SPU branch conditions have two components!");
-  
+
   // One-way branch.
   if (FBB == 0) {
     if (Cond.empty())   // Unconditional branch
@@ -600,7 +599,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
     }
     return 1;
   }
-  
+
   // Two-way Conditional Branch.
 #if 0
   BuildMI(&MBB, get(SPU::BRNZ))
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index 1abbc0a5c04..751f36e6972 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -583,7 +583,9 @@ def AHIvec:
 def AHIr16:
   RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
     "ahi\t$rT, $rA, $val", IntegerOp,
-    [(set R16C:$rT, (add R16C:$rA, v8i16SExt10Imm:$val))]>;
+    [(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>;
+
+// v4i32, i32 add instruction:
 
 class AInst<dag OOL, dag IOL, list<dag> pattern>:
   RRForm<0b00000011000, OOL, IOL,
@@ -604,21 +606,42 @@ multiclass AddInstruction {
   def v16i8: AVecInst<v16i8>;
   
   def r32:   ARegInst<R32C>;
-  def r8:    AInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), [/* no pattern */]>; 
 }
 
 defm A : AddInstruction;
 
-def AIvec:
-    RI10Form<0b00111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
-      "ai\t$rT, $rA, $val", IntegerOp,
-      [(set (v4i32 VECREG:$rT), (add (v4i32 VECREG:$rA),
-                                      v4i32SExt10Imm:$val))]>;
+class AIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b00111000, OOL, IOL,
+             "ai\t$rT, $rA, $val", IntegerOp,
+             pattern>;
 
-def AIr32:
-    RI10Form<0b00111000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
-      "ai\t$rT, $rA, $val", IntegerOp,
-      [(set R32C:$rT, (add R32C:$rA, i32ImmSExt10:$val))]>;
+class AIVecInst<ValueType vectype, PatLeaf immpred>:
+    AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>;
+
+class AIFPVecInst<ValueType vectype, PatLeaf immpred>:
+    AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           [/* no pattern */]>;
+
+class AIRegInst<RegisterClass rclass, PatLeaf immpred>:
+    AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+           [(set rclass:$rT, (add rclass:$rA, immpred:$val))]>;
+
+// Adds small integer epsilons to f32 bit patterns; used by the f32 fdiv code:
+class AIFPInst<RegisterClass rclass, PatLeaf immpred>:
+    AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+           [/* no pattern */]>;
+
+multiclass AddImmediate {
+  def v4i32: AIVecInst<v4i32, v4i32SExt10Imm>;
+
+  def r32: AIRegInst<R32C, i32ImmSExt10>;
+
+  def v4f32: AIFPVecInst<v4f32, v4i32SExt10Imm>;
+  def f32: AIFPInst<R32FP, i32ImmSExt10>;
+}
+
+defm AI : AddImmediate;
 
 def SFHvec:
     RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
@@ -795,8 +818,7 @@ def BGXvec:
 def MPYv8i16:
   RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
     "mpy\t$rT, $rA, $rB", IntegerMulDiv,
-    [(set (v8i16 VECREG:$rT), (SPUmpy_vec (v8i16 VECREG:$rA),
-                                          (v8i16 VECREG:$rB)))]>;
+    [/* no pattern */]>;
 
 def MPYr16:
   RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
@@ -812,8 +834,7 @@ class MPYUInst<dag OOL, dag IOL, list<dag> pattern>:
 
 def MPYUv4i32:
   MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-           [(set (v4i32 VECREG:$rT),
-                 (SPUmpyu_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+           [/* no pattern */]>;
 
 def MPYUr16:
   MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
@@ -821,7 +842,7 @@ def MPYUr16:
 
 def MPYUr32:
   MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
-           [(set R32C:$rT, (SPUmpyu_int R32C:$rA, R32C:$rB))]>;
+           [/* no pattern */]>;
 
 // mpyi: multiply 16 x s10imm -> 32 result.
 
@@ -892,87 +913,78 @@ class MPYHInst<dag OOL, dag IOL, list<dag> pattern>:
          
 def MPYHv4i32:
     MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-             [(set (v4i32 VECREG:$rT),
-                   (SPUmpyh_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+             [/* no pattern */]>;
 
 def MPYHr32:
     MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
-             [(set R32C:$rT, (SPUmpyh_int R32C:$rA, R32C:$rB))]>;
+             [/* no pattern */]>;
 
 // mpys: multiply high and shift right (returns the top half of
 // a 16-bit multiply, sign extended to 32 bits.)
-def MPYSvec:
-    RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-      "mpys\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
 
-def MPYSr16:
-    RRForm<0b11100011110, (outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
+class MPYSInst<dag OOL, dag IOL>:
+    RRForm<0b11100011110, OOL, IOL, 
       "mpys\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
+      [/* no pattern */]>;
+
+def MPYSvec:
+    MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+    
+def MPYSr16:
+    MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>;
 
 // mpyhh: multiply high-high (returns the 32-bit result from multiplying
 // the top 16 bits of the $rA, $rB)
+
+class MPYHHInst<dag OOL, dag IOL>:
+  RRForm<0b01100011110, OOL, IOL,
+        "mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
+        [/* no pattern */]>;
+        
 def MPYHHv8i16:
-    RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-      "mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
-      [(set (v8i16 VECREG:$rT),
-            (SPUmpyhh_vec (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+    MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
 
 def MPYHHr32:
-    RRForm<0b01100011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
-      "mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
+    MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
 
 // mpyhha: Multiply high-high, add to $rT:
-def MPYHHAvec:
-    RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-      "mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
 
-def MPYHHAr32:
-    RRForm<0b01100010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+class MPYHHAInst<dag OOL, dag IOL>:
+    RRForm<0b01100010110, OOL, IOL,
       "mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
+      [/* no pattern */]>;
+
+def MPYHHAvec:
+    MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+    
+def MPYHHAr32:
+    MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
 
 // mpyhhu: Multiply high-high, unsigned
-def MPYHHUvec:
-    RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-      "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
 
-def MPYHHUr32:
-    RRForm<0b01110011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+class MPYHHUInst<dag OOL, dag IOL>:
+    RRForm<0b01110011110, OOL, IOL,
       "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
+      [/* no pattern */]>;
+
+def MPYHHUvec:
+    MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+    
+def MPYHHUr32:
+    MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
 
 // mpyhhau: Multiply high-high, unsigned
+
+class MPYHHAUInst<dag OOL, dag IOL>:
+    RRForm<0b01110010110, OOL, IOL,
+      "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
+      [/* no pattern */]>;
+
 def MPYHHAUvec:
-    RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-      "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
-
+    MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+    
 def MPYHHAUr32:
-    RRForm<0b01110010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
-      "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
-      []>;
-
-//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
-// v4i32, i32 multiply instruction sequence:
-//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
-def MPYv4i32:
-  Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
-      (Av4i32
-        (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
-                (MPYHv4i32 VECREG:$rB, VECREG:$rA)),
-        (MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
-
-def MPYi32:
-  Pat<(mul R32C:$rA, R32C:$rB),
-      (Ar32
-        (Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
-              (MPYHr32 R32C:$rB, R32C:$rA)),
-        (MPYUr32 R32C:$rA, R32C:$rB))>;
+    MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
 
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // clz: Count leading zeroes
@@ -983,7 +995,7 @@ class CLZInst<dag OOL, dag IOL, list<dag> pattern>:
 
 class CLZRegInst<RegisterClass rclass>:
     CLZInst<(outs rclass:$rT), (ins rclass:$rA),
-	    [(set rclass:$rT, (ctlz rclass:$rA))]>;
+            [(set rclass:$rT, (ctlz rclass:$rA))]>;
 
 class CLZVecInst<ValueType vectype>:
     CLZInst<(outs VECREG:$rT), (ins VECREG:$rA),
@@ -1424,7 +1436,7 @@ multiclass BitwiseOr
   def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
                   [/* no pattern */]>;
 
-  // scalar->vector promotion:
+  // scalar->vector promotion, prefslot2vec:
   def v16i8_i8:  ORPromoteScalar<R8C>;
   def v8i16_i16: ORPromoteScalar<R16C>;
   def v4i32_i32: ORPromoteScalar<R32C>;
@@ -1432,7 +1444,7 @@ multiclass BitwiseOr
   def v4f32_f32: ORPromoteScalar<R32FP>;
   def v2f64_f64: ORPromoteScalar<R64FP>;
 
-  // extract element 0:
+  // vector->scalar demotion, vec2prefslot:
   def i8_v16i8:  ORExtractElt<R8C>;
   def i16_v8i16: ORExtractElt<R16C>;
   def i32_v4i32: ORExtractElt<R32C>;
@@ -1831,6 +1843,13 @@ class SELBVecInst<ValueType vectype>:
                      (and (vnot (vectype VECREG:$rC)),
                           (vectype VECREG:$rA))))]>;
 
+class SELBVecVCondInst<ValueType vectype>:
+  SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+           [(set (vectype VECREG:$rT),
+                 (select (vectype VECREG:$rC),
+                         (vectype VECREG:$rB),
+                         (vectype VECREG:$rA)))]>;
+
 class SELBVecCondInst<ValueType vectype>:
   SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC),
            [(set (vectype VECREG:$rT),
@@ -1867,8 +1886,21 @@ multiclass SelectBits
   def v4i32_cond: SELBVecCondInst<v4i32>;
   def v2i64_cond: SELBVecCondInst<v2i64>;
 
+  def v16i8_vcond: SELBVecVCondInst<v16i8>;  // VECREG-condition select
+  def v8i16_vcond: SELBVecVCondInst<v8i16>;
+  def v4i32_vcond: SELBVecVCondInst<v4i32>;
+  def v2i64_vcond: SELBVecVCondInst<v2i64>;
+
+  def v4f32_cond:
+        SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+                 [(set (v4f32 VECREG:$rT),
+                       (select (v4i32 VECREG:$rC),
+                               (v4f32 VECREG:$rB),
+                               (v4f32 VECREG:$rA)))]>;
+
   // SELBr64_cond is defined further down, look for i64 comparisons
   def r32_cond:   SELBRegCondInst<R32C, R32C>;
+  def f32_cond:   SELBRegCondInst<R32C, R32FP>;
   def r16_cond:   SELBRegCondInst<R16C, R16C>;
   def r8_cond:    SELBRegCondInst<R8C,  R8C>;
 }
@@ -2454,11 +2486,11 @@ class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>:
            RotateShift, pattern>;
 
 class ROTQBIVecInst<ValueType vectype>:
-    ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
                [/* no pattern yet */]>;
 
 class ROTQBIRegInst<RegisterClass rclass>:
-    ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+    ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
                [/* no pattern yet */]>;
 
 multiclass RotateQuadByBitCount
@@ -2645,9 +2677,6 @@ def : Pat<(srl R32C:$rA, (i8 imm:$val)),
 // ROTQMBYvec: This is a vector form merely so that when used in an
 // instruction pattern, type checking will succeed. This instruction assumes
 // that the user knew to negate $rB.
-//
-// Using the SPUrotquad_rz_bytes target-specific DAG node, the patterns
-// ensure that $rB is negated.
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 
 class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>:
@@ -2660,8 +2689,7 @@ class ROTQMBYVecInst<ValueType vectype>:
 
 class ROTQMBYRegInst<RegisterClass rclass>:
     ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
-                [(set rclass:$rT,
-                      (SPUrotquad_rz_bytes rclass:$rA, R32C:$rB))]>;
+                [/* no pattern */]>;
 
 multiclass RotateQuadBytes
 {
@@ -2676,32 +2704,17 @@ multiclass RotateQuadBytes
 
 defm ROTQMBY : RotateQuadBytes;
 
-def : Pat<(SPUrotquad_rz_bytes (v16i8 VECREG:$rA), R32C:$rB),
-          (ROTQMBYv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bytes (v8i16 VECREG:$rA), R32C:$rB),
-          (ROTQMBYv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bytes (v4i32 VECREG:$rA), R32C:$rB),
-          (ROTQMBYv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bytes (v2i64 VECREG:$rA), R32C:$rB),
-          (ROTQMBYv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bytes GPRC:$rA, R32C:$rB),
-          (ROTQMBYr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bytes R64C:$rA, R32C:$rB),
-          (ROTQMBYr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>;
-
 class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>:
     RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val",
             RotateShift, pattern>;
 
 class ROTQMBYIVecInst<ValueType vectype>:
     ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
-                 [(set (vectype VECREG:$rT),
-                       (SPUrotquad_rz_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
+                 [/* no pattern */]>;
 
 class ROTQMBYIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, PatLeaf pred>:
     ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
-                 [(set rclass:$rT,
-                       (SPUrotquad_rz_bytes rclass:$rA, (inttype pred:$val)))]>;
+                 [/* no pattern */]>;
 
 multiclass RotateQuadBytesImm
 {
@@ -2725,8 +2738,8 @@ class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
            RotateShift, pattern>;
 
 class ROTQMBYBIVecInst<ValueType vectype>:
-    ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-                  [/* no pattern, intrinsic? */]>;
+    ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                  [/* no pattern */]>;
 
 multiclass RotateMaskQuadByBitCount
 {
@@ -2768,19 +2781,6 @@ multiclass RotateMaskQuadByBits
 
 defm ROTQMBI: RotateMaskQuadByBits;
 
-def : Pat<(SPUrotquad_rz_bits (v16i8 VECREG:$rA), R32C:$rB),
-          (ROTQMBIv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bits (v8i16 VECREG:$rA), R32C:$rB),
-          (ROTQMBIv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bits (v4i32 VECREG:$rA), R32C:$rB),
-          (ROTQMBIv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bits (v2i64 VECREG:$rA), R32C:$rB),
-          (ROTQMBIv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bits GPRC:$rA, R32C:$rB),
-          (ROTQMBIr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>;
-def : Pat<(SPUrotquad_rz_bits R64C:$rA, R32C:$rB),
-          (ROTQMBIr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>;
-
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // Rotate quad and mask by bits, immediate
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
@@ -2791,13 +2791,11 @@ class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>:
 
 class ROTQMBIIVecInst<ValueType vectype>:
    ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
-                 [(set (vectype VECREG:$rT),
-                       (SPUrotquad_rz_bits (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
+                 [/* no pattern */]>;
 
 class ROTQMBIIRegInst<RegisterClass rclass>:
    ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val),
-                 [(set rclass:$rT,
-                       (SPUrotquad_rz_bits rclass:$rA, (i32 uimm7:$val)))]>;
+                 [/* no pattern */]>;
 
 multiclass RotateMaskQuadByBitsImm
 {
@@ -3142,6 +3140,15 @@ multiclass CmpGtrWordImm
 
   def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
                     [(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>;
+
+  // CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence:
+  def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v4i32 VECREG:$rT),
+                             (setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))),
+                                    (v4i32 v4i32SExt16Imm:$val)))]>;
+
+  def f32:   CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val),
+  		      [/* no pattern */]>;
 }
 
 class CLGTBInst<dag OOL, dag IOL, list<dag> pattern> :
@@ -3750,62 +3757,63 @@ let isTerminator = 1, isBarrier = 1 in {
 
 class FAInst<dag OOL, dag IOL, list<dag> pattern>:
     RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB",
-	   SPrecFP, pattern>;
+           SPrecFP, pattern>;
 
 class FAVecInst<ValueType vectype>:
     FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
              [(set (vectype VECREG:$rT),
-		   (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+                   (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
 
 multiclass SFPAdd
 {
   def v4f32: FAVecInst<v4f32>;
-  def r32:   FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
-		    [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
+  def f32:   FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+                    [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
 }
 
 defm FA : SFPAdd;
 
 class FSInst<dag OOL, dag IOL, list<dag> pattern>:
     RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB",
-	   SPrecFP, pattern>;
+           SPrecFP, pattern>;
 
 class FSVecInst<ValueType vectype>:
     FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-    	   [(set (vectype VECREG:$rT),
-	         (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+           [(set (vectype VECREG:$rT),
+                 (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
 
 multiclass SFPSub
 {
   def v4f32: FSVecInst<v4f32>;
-  def r32:   FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
-		    [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
+  def f32:   FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+                    [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
 }
 
 defm FS : SFPSub;
 
 // Floating point reciprocal estimate
-def FREv4f32 :
-    RRForm_1<0b00011101100, (outs VECREG:$rT), (ins VECREG:$rA),
-      "frest\t$rT, $rA", SPrecFP,
-      [(set (v4f32 VECREG:$rT), (SPUreciprocalEst (v4f32 VECREG:$rA)))]>;
 
-def FREf32 :
-    RRForm_1<0b00011101100, (outs R32FP:$rT), (ins R32FP:$rA),
-      "frest\t$rT, $rA", SPrecFP,
-      [(set R32FP:$rT, (SPUreciprocalEst R32FP:$rA))]>;
+class FRESTInst<dag OOL, dag IOL>:
+  RRForm_1<0b00110111000, OOL, IOL,
+           "frest\t$rT, $rA", SPrecFP,
+           [/* no pattern */]>;
+
+def FRESTv4f32 :
+    FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
+
+def FRESTf32 :
+    FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>;
 
 // Floating point interpolate (used in conjunction with reciprocal estimate)
 def FIv4f32 :
     RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
       "fi\t$rT, $rA, $rB", SPrecFP,
-      [(set (v4f32 VECREG:$rT), (SPUinterpolate (v4f32 VECREG:$rA),
-                                                (v4f32 VECREG:$rB)))]>;
+      [/* no pattern */]>;
 
 def FIf32 :
     RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
       "fi\t$rT, $rA, $rB", SPrecFP,
-      [(set R32FP:$rT, (SPUinterpolate R32FP:$rA, R32FP:$rB))]>;
+      [/* no pattern */]>;
 
 //--------------------------------------------------------------------------
 // Basic single precision floating point comparisons:
@@ -4445,12 +4453,14 @@ def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0),
                        (SPUlo tconstpool:$in, 0)),
           (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
 
+/*
 def : Pat<(SPUindirect R32C:$sp, i32ImmSExt10:$imm),
           (AIr32 R32C:$sp, i32ImmSExt10:$imm)>;
 
 def : Pat<(SPUindirect R32C:$sp, imm:$imm),
           (Ar32 R32C:$sp,
                 (IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm)))>;
+ */
 
 def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)),
           (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
@@ -4466,5 +4476,7 @@ def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)),
 
 // Instrinsics:
 include "CellSDKIntrinsics.td"
+// Various math operator instruction sequences
+include "SPUMathInstr.td"
 // 64-bit "instructions"/support
 include "SPU64InstrInfo.td"
diff --git a/lib/Target/CellSPU/SPUMathInstr.td b/lib/Target/CellSPU/SPUMathInstr.td
new file mode 100644
index 00000000000..38279a0a9f8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMathInstr.td
@@ -0,0 +1,99 @@
+//======--- SPUMathInstr.td - Cell SPU math operations -*- tablegen -*--======//
+//
+//                     Cell SPU math operations
+//
+// This target description file contains instruction sequences for various
+// math operations, such as vector multiplies, i32 multiply, etc., for the
+// SPU's i32, i16, i8 and corresponding vector types.
+//
+// Any resemblance to libsimdmath or the Cell SDK simdmath library is
+// purely and completely coincidental.
+//
+// Primary author: Scott Michel (scottm@aero.org)
+//===----------------------------------------------------------------------===//
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v16i8 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)),
+          (ORv4i32
+           (ANDv4i32
+            (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+                       (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8),
+                                             (ROTMAHIv8i16 VECREG:$rB, 8)), 8),
+                       (FSMBIv8i16 0x2222)),
+            (ILAv4i32 0x0000ffff)),
+           (SHLIv4i32
+            (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16),
+                                 (ROTMAIv4i32_i32 VECREG:$rB, 16)),
+                       (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8),
+                                             (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8),
+                       (FSMBIv8i16 0x2222)), 16))>;
+                        
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v8i16 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+          (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+                     (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16),
+                     (FSMBIv8i16 0xcccc))>;
+                 
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v4i32, i32 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def MPYv4i32:
+  Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
+      (Av4i32
+        (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
+                (MPYHv4i32 VECREG:$rB, VECREG:$rA)),
+        (MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
+
+def MPYi32:
+  Pat<(mul R32C:$rA, R32C:$rB),
+      (Ar32
+        (Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
+              (MPYHr32 R32C:$rB, R32C:$rA)),
+        (MPYUr32 R32C:$rA, R32C:$rB))>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f32, v4f32 divide instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Reciprocal estimate and interpolation
+def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>;
+// Division estimate
+def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA),
+		  	       Interpf32.Fragment,
+	  	  	       DivEstf32.Fragment)>;
+// Epsilon addition
+def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>;
+
+def : Pat<(fdiv R32FP:$rA, R32FP:$rB),
+	  (SELBf32_cond NRaphf32.Fragment,
+			Epsilonf32.Fragment,
+			(CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>;
+
+// Reciprocal estimate and interpolation
+def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>;
+// Division estimate
+def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment,
+					      (v4f32 VECREG:$rB),
+					      (v4f32 VECREG:$rA)),
+		  	           Interpv4f32.Fragment,
+	  	  	           DivEstv4f32.Fragment)>;
+// Epsilon addition
+def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>;
+
+def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
+	  (SELBv4f32_cond NRaphv4f32.Fragment,
+			Epsilonv4f32.Fragment,
+			(CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB),
+					      Epsilonv4f32.Fragment,
+					      (v4f32 VECREG:$rA)), -1))>;
diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td
index 5cf229e4b78..89a52eedb18 100644
--- a/lib/Target/CellSPU/SPUNodes.td
+++ b/lib/Target/CellSPU/SPUNodes.td
@@ -87,24 +87,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;
 // SPUISelLowering.h):
 def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>;
 
-// SPU 16-bit multiply
-def SPUmpy_vec: SDNode<"SPUISD::MPY", SPUVecBinop, []>;
-
-// SPU multiply unsigned, used in instruction lowering for v4i32
-// multiplies:
-def SPUmpyu_vec: SDNode<"SPUISD::MPYU", SPUVecBinop, []>;
-def SPUmpyu_int: SDNode<"SPUISD::MPYU", SDTIntBinOp, []>;
-
-// SPU 16-bit multiply high x low, shift result 16-bits
-// Used to compute intermediate products for 32-bit multiplies
-def SPUmpyh_vec: SDNode<"SPUISD::MPYH", SPUVecBinop, []>;
-def SPUmpyh_int: SDNode<"SPUISD::MPYH", SDTIntBinOp, []>;
-
-// SPU 16-bit multiply high x high, 32-bit product
-// Used to compute intermediate products for 16-bit multiplies
-def SPUmpyhh_vec: SDNode<"SPUISD::MPYHH", SPUVecBinop, []>;
-def SPUmpyhh_int: SDNode<"SPUISD::MPYHH", SDTIntBinOp, []>;
-
 // Shift left quadword by bits and bytes
 def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
 def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
@@ -117,11 +99,6 @@ def SPUvec_sra: SDNode<"SPUISD::VEC_SRA", SPUvecshift_type, []>;
 def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>;
 def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>;
 
-def SPUrotquad_rz_bytes: SDNode<"SPUISD::ROTQUAD_RZ_BYTES",
-                                    SPUvecshift_type, []>;
-def SPUrotquad_rz_bits: SDNode<"SPUISD::ROTQUAD_RZ_BITS",
-                                    SPUvecshift_type, []>;
-
 // Vector rotate left, bits shifted out of the left are rotated in on the right
 def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT",
                              SPUvecshift_type, []>;
@@ -141,12 +118,6 @@ def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>;
 // SPU gather bits instruction:
 def SPUgatherbits: SDNode<"SPUISD::GATHER_BITS", SPUgatherbits_type, []>;
 
-// SPU floating point interpolate
-def SPUinterpolate : SDNode<"SPUISD::FPInterp", SDTFPBinOp, []>;
-
-// SPU floating point reciprocal estimate (used for fdiv)
-def SPUreciprocalEst: SDNode<"SPUISD::FPRecipEst", SDTFPUnaryOp, []>;
-
 def SDTprefslot2vec: SDTypeProfile<1, 1, []>;
 def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>;
 
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index cf4089fa29e..381522dac54 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -238,7 +238,7 @@ SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const
     SPU::R0,    /* link register */
     0 /* end */
   };
-  
+
   return SPU_CalleeSaveRegs;
 }
 
@@ -268,7 +268,7 @@ SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
     &SPU::GPRCRegClass, /* link register */
     0 /* end */
   };
- 
+
   return SPU_CalleeSaveRegClasses;
 }
 
@@ -339,10 +339,13 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   // Now add the frame object offset to the offset from r1.
   int Offset = MFI->getObjectOffset(FrameIndex);
 
-  // Most instructions, except for generated FrameIndex additions using AIr32,
-  // have the immediate in operand 1. AIr32, in this case, has the immediate
-  // in operand 2.
-  unsigned OpNo = (MI.getOpcode() != SPU::AIr32 ? 1 : 2);
+  // Most instructions, except for generated FrameIndex additions using AIr32
+  // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the
+  // immediate in operand 2.
+  unsigned OpNo = 1;
+  if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32)
+    OpNo = 2;
+
   MachineOperand &MO = MI.getOperand(OpNo);
 
   // Offset is biased by $lr's slot at the bottom.
@@ -355,7 +358,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   if (Offset > SPUFrameInfo::maxFrameOffset()
       || Offset < SPUFrameInfo::minFrameOffset()) {
     cerr << "Large stack adjustment ("
-         << Offset 
+         << Offset
          << ") in SPURegisterInfo::eliminateFrameIndex.";
   } else {
     MO.ChangeToImmediate(Offset);
@@ -371,7 +374,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
 
   // Get the number of bytes to allocate from the FrameInfo
   unsigned FrameSize = MFI->getStackSize();
-  
+
   // Get the alignments provided by the target, and the maximum alignment
   // (if any) of the fixed frame objects.
   unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
@@ -381,7 +384,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
 
   // Get the maximum call frame size of all the calls.
   unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
-    
+
   // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
   // that allocations will be aligned.
   if (MFI->hasVarSizedObjects())
@@ -389,7 +392,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
 
   // Update maximum call frame size.
   MFI->setMaxCallFrameSize(maxCallFrameSize);
-  
+
   // Include call frame size in total.
   FrameSize += maxCallFrameSize;
 
@@ -418,18 +421,18 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
-  
+
   // Prepare for debug frame info.
   bool hasDebugInfo = MMI && MMI->hasDebugInfo();
   unsigned FrameLabelId = 0;
-  
+
   // Move MBBI back to the beginning of the function.
   MBBI = MBB.begin();
-  
+
   // Work out frame sizes.
   determineFrameLayout(MF);
   int FrameSize = MFI->getStackSize();
-  
+
   assert((FrameSize & 0xf) == 0
          && "SPURegisterInfo::emitPrologue: FrameSize not aligned");
 
@@ -440,7 +443,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
       FrameLabelId = MMI->NextLabelID();
       BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(FrameLabelId);
     }
-  
+
     // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp)
     // for the ABI
     BuildMI(MBB, MBBI, TII.get(SPU::STQDr32), SPU::R0).addImm(16)
@@ -476,15 +479,15 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
       cerr << "Unhandled frame size: " << FrameSize << "\n";
       abort();
     }
- 
+
     if (hasDebugInfo) {
       std::vector<MachineMove> &Moves = MMI->getFrameMoves();
-    
+
       // Show update of SP.
       MachineLocation SPDst(MachineLocation::VirtualFP);
       MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize);
       Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
-    
+
       // Add callee saved registers to move list.
       const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
       for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
@@ -495,11 +498,11 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
         MachineLocation CSSrc(Reg);
         Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
       }
-    
+
       // Mark effective beginning of when frame pointer is ready.
       unsigned ReadyLabelId = MMI->NextLabelID();
       BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(ReadyLabelId);
-    
+
       MachineLocation FPDst(SPU::R1);
       MachineLocation FPSrc(MachineLocation::VirtualFP);
       Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
diff --git a/test/CodeGen/CellSPU/fdiv.ll b/test/CodeGen/CellSPU/fdiv.ll
index 826a2faaabf..d121c3f8c90 100644
--- a/test/CodeGen/CellSPU/fdiv.ll
+++ b/test/CodeGen/CellSPU/fdiv.ll
@@ -1,9 +1,11 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
 ; RUN: grep frest    %t1.s | count 2 
 ; RUN: grep -w fi    %t1.s | count 2 
-; RUN: grep fm       %t1.s | count 4 
+; RUN: grep -w fm    %t1.s | count 2
 ; RUN: grep fma      %t1.s | count 2 
-; RUN: grep fnms     %t1.s | count 2
+; RUN: grep fnms     %t1.s | count 4
+; RUN: grep cgti     %t1.s | count 2
+; RUN: grep selb     %t1.s | count 2
 ;
 ; This file includes standard floating point arithmetic instructions
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
diff --git a/test/CodeGen/CellSPU/i64ops.ll b/test/CodeGen/CellSPU/i64ops.ll
index 5e7897bc971..51abd44a09e 100644
--- a/test/CodeGen/CellSPU/i64ops.ll
+++ b/test/CodeGen/CellSPU/i64ops.ll
@@ -1,8 +1,5 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
-; RUN: grep {fsmbi.*61680}   %t1.s | count 1
-; RUN: grep rotqmbyi         %t1.s | count 1
-; RUN: grep rotmai           %t1.s | count 1
-; RUN: grep selb             %t1.s | count 1
+; RUN: grep xswd	     %t1.s | count 1
 ; RUN: grep shufb            %t1.s | count 2
 ; RUN: grep cg               %t1.s | count 1
 ; RUN: grep addx             %t1.s | count 1
diff --git a/test/CodeGen/CellSPU/mul_ops.ll b/test/CodeGen/CellSPU/mul_ops.ll
index 843505f1359..085ce555dc2 100644
--- a/test/CodeGen/CellSPU/mul_ops.ll
+++ b/test/CodeGen/CellSPU/mul_ops.ll
@@ -8,7 +8,7 @@
 ; RUN: grep and     %t1.s | count 2
 ; RUN: grep selb    %t1.s | count 6
 ; RUN: grep fsmbi   %t1.s | count 4
-; RUN: grep shli    %t1.s | count 2
+; RUN: grep shli    %t1.s | count 4
 ; RUN: grep shlhi   %t1.s | count 4
 ; RUN: grep ila     %t1.s | count 2
 ; RUN: grep xsbh    %t1.s | count 4
diff --git a/test/CodeGen/CellSPU/shift_ops.ll b/test/CodeGen/CellSPU/shift_ops.ll
index b6629cac2a1..5b60dc178fa 100644
--- a/test/CodeGen/CellSPU/shift_ops.ll
+++ b/test/CodeGen/CellSPU/shift_ops.ll
@@ -1,10 +1,21 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
-; RUN: grep shlh   %t1.s | count 84
-; RUN: grep shlhi  %t1.s | count 51
-; RUN: grep shl    %t1.s | count 168
-; RUN: grep shli   %t1.s | count 51
-; RUN: grep xshw   %t1.s | count 5
-; RUN: grep and    %t1.s | count 5
+; RUN: grep -w shlh      %t1.s | count 9
+; RUN: grep -w shlhi     %t1.s | count 3
+; RUN: grep -w shl       %t1.s | count 9
+; RUN: grep -w shli      %t1.s | count 3
+; RUN: grep -w xshw      %t1.s | count 5
+; RUN: grep -w and       %t1.s | count 5
+; RUN: grep -w andi      %t1.s | count 2
+; RUN: grep -w rotmi     %t1.s | count 2
+; RUN: grep -w rotqmbyi  %t1.s | count 1
+; RUN: grep -w rotqmbii  %t1.s | count 2
+; RUN: grep -w rotqmby   %t1.s | count 1
+; RUN: grep -w rotqmbi   %t1.s | count 1
+; RUN: grep -w rotqbyi   %t1.s | count 1
+; RUN: grep -w rotqbii   %t1.s | count 2
+; RUN: grep -w rotqbybi  %t1.s | count 1
+; RUN: grep -w sfi       %t1.s | count 3
+
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
 target triple = "spu"
 
@@ -210,3 +221,57 @@ define i32 @shli_i32_12(i32 zeroext %arg1) zeroext {
         %A = shl i32 0, %arg1
         ret i32 %A
 }
+
+;; i64 shift left
+
+define i64 @shl_i64_1(i64 %arg1) {
+	%A = shl i64 %arg1, 9
+	ret i64 %A
+}
+
+define i64 @shl_i64_2(i64 %arg1) {
+	%A = shl i64 %arg1, 3
+	ret i64 %A
+}
+
+define i64 @shl_i64_3(i64 %arg1, i32 %shift) {
+	%1 = zext i32 %shift to i64
+	%2 = shl i64 %arg1, %1
+	ret i64 %2
+}
+
+;; i64 shift right logical (shift 0s in from the left)
+
+define i64 @lshr_i64_1(i64 %arg1) {
+	%1 = lshr i64 %arg1, 9
+	ret i64 %1
+}
+
+define i64 @lshr_i64_2(i64 %arg1) {
+	%1 = lshr i64 %arg1, 3
+	ret i64 %1
+}
+
+define i64 @lshr_i64_3(i64 %arg1, i32 %shift) {
+	%1 = zext i32 %shift to i64
+	%2 = lshr i64 %arg1, %1
+	ret i64 %2
+}
+
+;; i64 shift right arithmetic (shift copies of the sign bit in from the left)
+
+define i64 @ashr_i64_1(i64 %arg) {
+	%1 = ashr i64 %arg, 9
+	ret i64 %1
+}
+
+define i64 @ashr_i64_2(i64 %arg) {
+	%1 = ashr i64 %arg, 3
+	ret i64 %1
+}
+
+define i64 @ashr_i64_3(i64 %arg1, i32 %shift) {
+	%1 = zext i32 %shift to i64
+	%2 = ashr i64 %arg1, %1
+	ret i64 %2
+}
diff --git a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
index 7b86070095f..3819797d148 100644
--- a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
+++ b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
@@ -34,19 +34,45 @@ struct pred_s preds[] = {
   { "neq", i64_neq, i64_neq_select }
 };
 
+uint64_t i64_shl_const(uint64_t a) {
+  return a << 10;
+}
+
+uint64_t i64_shl(uint64_t a, int amt) {
+  return a << amt;
+}
+
+uint64_t i64_srl_const(uint64_t a) {
+  return a >> 10;
+}
+
+uint64_t i64_srl(uint64_t a, int amt) {
+  return a >> amt;
+}
+
+int64_t i64_sra_const(int64_t a) {
+  return a >> 10;
+}
+
+int64_t i64_sra(int64_t a, int amt) {
+  return a >> amt;
+}
+
 int main(void) {
   int i;
-  int64_t a = 1234567890000LL;
-  int64_t b = 2345678901234LL;
-  int64_t c = 1234567890001LL;
-  int64_t d =         10001LL;
-  int64_t e =         10000LL;
+  int64_t a =  1234567890003LL;
+  int64_t b =  2345678901235LL;
+  int64_t c =  1234567890001LL;
+  int64_t d =          10001LL;
+  int64_t e =          10000LL;
+  int64_t f = -1068103409991LL;
 
   printf("a = %16lld (0x%016llx)\n", a, a);
   printf("b = %16lld (0x%016llx)\n", b, b);
   printf("c = %16lld (0x%016llx)\n", c, c);
   printf("d = %16lld (0x%016llx)\n", d, d);
   printf("e = %16lld (0x%016llx)\n", e, e);
+  printf("f = %16lld (0x%016llx)\n", f, f);
   printf("----------------------------------------\n");
 
   for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) {
@@ -64,5 +90,23 @@ int main(void) {
     printf("----------------------------------------\n");
   }
 
+  printf("a                = 0x%016llx\n", a);
+  printf("i64_shl_const(a) = 0x%016llx\n", i64_shl_const(a));
+  printf("i64_shl(a)       = 0x%016llx\n", i64_shl(a, 5));
+  printf("i64_srl_const(a) = 0x%016llx\n", i64_srl_const(a));
+  printf("i64_srl(a)       = 0x%016llx\n", i64_srl(a, 5));
+  printf("i64_sra_const(a) = 0x%016llx\n", i64_sra_const(a));
+  printf("i64_sra(a)       = 0x%016llx\n", i64_sra(a, 5));
+  printf("----------------------------------------\n");
+
+  printf("f                = 0x%016llx\n", f);
+  printf("i64_shl_const(f) = 0x%016llx\n", i64_shl_const(f));
+  printf("i64_shl(f)       = 0x%016llx\n", i64_shl(f, 10));
+  printf("i64_srl_const(f) = 0x%016llx\n", i64_srl_const(f));
+  printf("i64_srl(f)       = 0x%016llx\n", i64_srl(f, 10));
+  printf("i64_sra_const(f) = 0x%016llx\n", i64_sra_const(f));
+  printf("i64_sra(f)       = 0x%016llx\n", i64_sra(f, 10));
+  printf("----------------------------------------\n");
+
   return 0;
 }