- Start moving target-dependent nodes that could be represented by an
  instruction sequence and cannot ordinarily be simplified by DAGcombine
  into the various target description files or SPUDAGToDAGISel.cpp.

  This makes some 64-bit operations legal.

- Eliminate target-dependent ISD enums.

- Update tests.
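In sketch form (paraphrasing the diff below, with SHL as the example; the
same dispatch is added for SRL and SRA):

    // SPUISelLowering.cpp constructor: was Custom, now Legal
    setOperationAction(ISD::SHL, MVT::i64, Legal);

    // SPUISelDAGToDAG.cpp, in SPUDAGToDAGISel::Select():
    if (Opc == ISD::SHL && OpVT == MVT::i64)
      return SelectSHLi64(Op, OpVT);   // emits the quadword shift sequence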


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61508 91177308-0d34-0410-b5e6-96231b3b80d8
Scott Michel 2008-12-30 23:28:25 +00:00
parent 998dee96d3
commit 02d711b93e
15 changed files with 736 additions and 634 deletions

View File

@ -15,6 +15,13 @@
//
include "llvm/Target/Target.td"
// Holder of code fragments (you'd think this'd already be in
// a td file somewhere... :-)
class CodeFrag<dag frag> {
dag Fragment = frag;
}
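// Illustration (not in the original file): later patterns reuse a fragment
// by name, e.g. SPU64InstrInfo.td expands compare.Fragment and
// SPUMathInstr.td expands Interpf32.Fragment inside larger Pat<> results.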
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//

View File

@ -1,8 +1,17 @@
//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====//
//
// Cell SPU 64-bit operations
//
// Primary author: Scott Michel (scottm@aero.org)
//===----------------------------------------------------------------------===//
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// 64-bit comparisons:
//
// 1. The instruction sequences for vector vice scalar differ by a
// constant.
// constant. In the scalar case, we're only interested in the
// top two 32-bit slots, whereas we're interested in an exact
// all-four-slot match in the vector case.
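//
//    (Illustration, not in the original comment: the scalar i64 lives in
//    the preferred slot, i.e. word slots 0 and 1, so a == b exactly when
//    (a >> 32) == (b >> 32) and lo32(a) == lo32(b); only the top two of
//    the four CEQ word results are significant.)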
//
// 2. There are no "immediate" forms, since loading 64-bit constants
// could be a constant pool load.
@ -10,10 +19,10 @@
// 3. i64 setcc results are i32, which are subsequently converted to a FSM
// mask when used in a select pattern.
//
// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask
// (TODO)
// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO)
// [Note: this may be moot, since gb produces v4i32 or r32.]
//
// M00$E Kan be Pretty N@sTi!!!!! (appologies to Monty!)
// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!)
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// selb instruction definition for i64. Note that the selection mask is
@ -22,17 +31,15 @@ def SELBr64_cond:
SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
[/* no pattern */]>;
class CodeFrag<dag frag> {
dag Fragment = frag;
}
class I64SELECTNegCond<PatFrag cond, CodeFrag cmpare>:
// select the negative condition:
class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
(SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 cmpare.Fragment))>;
(SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>;
class I64SETCCNegCond<PatFrag cond, CodeFrag cmpare>:
// setcc the negative condition:
class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
Pat<(cond R64C:$rA, R64C:$rB),
(XORIr32 cmpare.Fragment, -1)>;
(XORIr32 compare.Fragment, -1)>;
// The i64 seteq fragment that does the scalar->vector conversion and
// comparison:
@ -64,14 +71,13 @@ multiclass CompareEqual64 {
defm I64EQ: CompareEqual64;
def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>;
def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>;
def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
I64EQv2i64.Fragment>;
def I64Select:
Pat<(select R32C:$rC, R64C:$rB, R64C:$rA),
(SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>;
def : Pat<(select R32C:$rC, R64C:$rB, R64C:$rA),
(SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>;
// i64 setne:
def : I64SETCCNegCond<setne, I64EQr64>;
def : I64SELECTNegCond<setne, I64EQr64>;
def : I64SELECTNegCond<setne, I64EQr64>;
// i64 setugt:

View File

@ -149,7 +149,7 @@ namespace {
}
bool
isHighLow(const SDValue &Op)
isHighLow(const SDValue &Op)
{
return (Op.getOpcode() == SPUISD::IndirectAddr
&& ((Op.getOperand(0).getOpcode() == SPUISD::Hi
@ -229,14 +229,14 @@ public:
TM(tm),
SPUtli(*tm.getTargetLowering())
{}
virtual bool runOnFunction(Function &Fn) {
// Make sure we re-emit a set of the global base reg if necessary
GlobalBaseReg = 0;
SelectionDAGISel::runOnFunction(Fn);
return true;
}
/// getI32Imm - Return a target constant with the specified value, of type
/// i32.
inline SDValue getI32Imm(uint32_t Imm) {
@ -248,7 +248,7 @@ public:
inline SDValue getI64Imm(uint64_t Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i64);
}
/// getSmallIPtrImm - Return a target constant of pointer type.
inline SDValue getSmallIPtrImm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
@ -258,6 +258,15 @@ public:
/// target-specific node if it hasn't already been changed.
SDNode *Select(SDValue Op);
//! Emit the instruction sequence for i64 shl
SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);
//! Emit the instruction sequence for i64 srl
SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);
//! Emit the instruction sequence for i64 sra
SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);
//! Returns true if the address N is an A-form (local store) address
bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
SDValue &Index);
@ -287,7 +296,7 @@ public:
switch (ConstraintCode) {
default: return true;
case 'm': // memory
if (!SelectDFormAddr(Op, Op, Op0, Op1)
if (!SelectDFormAddr(Op, Op, Op0, Op1)
&& !SelectAFormAddr(Op, Op, Op0, Op1))
SelectXFormAddr(Op, Op, Op0, Op1);
break;
@ -306,7 +315,7 @@ public:
#endif
break;
}
OutOps.push_back(Op0);
OutOps.push_back(Op1);
return false;
@ -318,14 +327,14 @@ public:
virtual const char *getPassName() const {
return "Cell SPU DAG->DAG Pattern Instruction Selection";
}
}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
/// this target when scheduling the DAG.
virtual HazardRecognizer *CreateTargetHazardRecognizer() {
const TargetInstrInfo *II = TM.getInstrInfo();
assert(II && "No InstrInfo?");
return new SPUHazardRecognizer(*II);
return new SPUHazardRecognizer(*II);
}
// Include the pieces autogenerated from the target description.
@ -375,7 +384,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
abort();
/*NOTREACHED*/
case SPUISD::AFormAddr:
case SPUISD::AFormAddr:
// Just load from memory if there's only a single use of the location,
// otherwise, this will get handled below with D-form offset addresses
if (N.hasOneUse()) {
@ -404,7 +413,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
return false;
}
bool
bool
SPUDAGToDAGISel::SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
SDValue &Base) {
const int minDForm2Offset = -(1 << 7);
@ -527,7 +536,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
ConstantSDNode *CN = cast<ConstantSDNode>(Op0);
offset = int32_t(CN->getSExtValue());
idxOp = Op1;
}
}
if (offset >= minOffset && offset <= maxOffset) {
Base = CurDAG->getTargetConstant(offset, PtrTy);
@ -622,27 +631,20 @@ SPUDAGToDAGISel::Select(SDValue Op) {
if (N->isMachineOpcode()) {
return NULL; // Already selected.
} else if (Opc == ISD::FrameIndex) {
// Selects to (add $sp, FI * stackSlotSize)
int FI =
SPUFrameInfo::FItoStackOffset(cast<FrameIndexSDNode>(N)->getIndex());
MVT PtrVT = SPUtli.getPointerTy();
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
SDValue Imm0 = CurDAG->getTargetConstant(0, Op.getValueType());
// Adjust stack slot to actual offset in frame:
if (isS10Constant(FI)) {
DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with AIr32 $sp, "
<< FI
<< "\n");
if (FI < 128) {
NewOpc = SPU::AIr32;
Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT);
Ops[1] = CurDAG->getTargetConstant(FI, PtrVT);
Ops[0] = TFI;
Ops[1] = Imm0;
n_ops = 2;
} else {
DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with Ar32 $sp, "
<< FI
<< "\n");
NewOpc = SPU::Ar32;
Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT);
Ops[1] = CurDAG->getConstant(FI, PtrVT);
Ops[0] = CurDAG->getRegister(SPU::R1, Op.getValueType());
Ops[1] = SDValue(CurDAG->getTargetNode(SPU::ILAr32, Op.getValueType(),
TFI, Imm0), 0);
n_ops = 2;
}
} else if (Opc == ISD::ZERO_EXTEND) {
@ -661,6 +663,18 @@ SPUDAGToDAGISel::Select(SDValue Op) {
n_ops = 2;
}
}
} else if (Opc == ISD::SHL) {
if (OpVT == MVT::i64) {
return SelectSHLi64(Op, OpVT);
}
} else if (Opc == ISD::SRL) {
if (OpVT == MVT::i64) {
return SelectSRLi64(Op, OpVT);
}
} else if (Opc == ISD::SRA) {
if (OpVT == MVT::i64) {
return SelectSRAi64(Op, OpVT);
}
} else if (Opc == SPUISD::LDRESULT) {
// Custom select instructions for LDRESULT
MVT VT = N->getValueType(0);
@ -713,7 +727,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
n_ops = 2;
}
}
if (n_ops > 0) {
if (N->hasOneUse())
return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops);
@ -723,7 +737,213 @@ SPUDAGToDAGISel::Select(SDValue Op) {
return SelectCode(Op);
}
/// createPPCISelDag - This pass converts a legalized DAG into a
/*!
* Emit the instruction sequence for i64 left shifts. The basic algorithm
* is to fill the bottom two word slots with zeros so that zeros are shifted
* in as the entire quadword is shifted left.
*
* \note This code could also be used to implement v2i64 shl.
*
* @param Op The shl operand
 * @param OpVT Op's machine value type (doesn't need to be passed, but
* makes life easier.)
* @return The SDNode with the entire instruction sequence
*/
SDNode *
SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
SDValue Op0 = Op.getOperand(0);
MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftAmtVT = ShiftAmt.getValueType();
SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0;
SDValue SelMaskVal;
VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0);
SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16);
SelMask = CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT, SelMaskVal);
ZeroFill = CurDAG->getTargetNode(SPU::ILv2i64, VecVT,
CurDAG->getTargetConstant(0, OpVT));
VecOp0 = CurDAG->getTargetNode(SPU::SELBv2i64, VecVT,
SDValue(ZeroFill, 0),
SDValue(VecOp0, 0),
SDValue(SelMask, 0));
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
unsigned bits = unsigned(CN->getZExtValue()) & 7;
if (bytes > 0) {
Shift =
CurDAG->getTargetNode(SPU::SHLQBYIv2i64, VecVT,
SDValue(VecOp0, 0),
CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
Shift =
CurDAG->getTargetNode(SPU::SHLQBIIv2i64, VecVT,
SDValue((Shift != 0 ? Shift : VecOp0), 0),
CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *Bytes =
CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT,
ShiftAmt,
CurDAG->getTargetConstant(3, ShiftAmtVT));
SDNode *Bits =
CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT,
ShiftAmt,
CurDAG->getTargetConstant(7, ShiftAmtVT));
Shift =
CurDAG->getTargetNode(SPU::SHLQBYv2i64, VecVT,
SDValue(VecOp0, 0), SDValue(Bytes, 0));
Shift =
CurDAG->getTargetNode(SPU::SHLQBIv2i64, VecVT,
SDValue(Shift, 0), SDValue(Bits, 0));
}
return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
}
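// Illustration, not part of this commit: a scalar C++ model of the
// sequence above (assumes <cstdint> and a compiler with __int128). The
// i64 occupies the upper 8 bytes of the quadword, and the FSMBI/SELB
// zero-fill below it guarantees that zeros shift in:
uint64_t shl_i64_model(uint64_t x, unsigned amt) {
  unsigned __int128 q = (unsigned __int128)x << 64;  // value in preferred slot
  q <<= (amt >> 3) * 8;                              // SHLQBY(I): byte part
  q <<= (amt & 7);                                   // SHLQBI(I): bit part
  return (uint64_t)(q >> 64);                        // ORi64_v2i64 extract
}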
/*!
* Emit the instruction sequence for i64 logical right shifts.
*
 * @param Op The srl operand
 * @param OpVT Op's machine value type (doesn't need to be passed, but
* makes life easier.)
* @return The SDNode with the entire instruction sequence
*/
SDNode *
SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
SDValue Op0 = Op.getOperand(0);
MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftAmtVT = ShiftAmt.getValueType();
SDNode *VecOp0, *Shift = 0;
VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0);
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
unsigned bits = unsigned(CN->getZExtValue()) & 7;
if (bytes > 0) {
Shift =
CurDAG->getTargetNode(SPU::ROTQMBYIv2i64, VecVT,
SDValue(VecOp0, 0),
CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
Shift =
CurDAG->getTargetNode(SPU::ROTQMBIIv2i64, VecVT,
SDValue((Shift != 0 ? Shift : VecOp0), 0),
CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *Bytes =
CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT,
ShiftAmt,
CurDAG->getTargetConstant(3, ShiftAmtVT));
SDNode *Bits =
CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT,
ShiftAmt,
CurDAG->getTargetConstant(7, ShiftAmtVT));
// Ensure that the shift amounts are negated!
Bytes = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
SDValue(Bytes, 0),
CurDAG->getTargetConstant(0, ShiftAmtVT));
Bits = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
SDValue(Bits, 0),
CurDAG->getTargetConstant(0, ShiftAmtVT));
Shift =
CurDAG->getTargetNode(SPU::ROTQMBYv2i64, VecVT,
SDValue(VecOp0, 0), SDValue(Bytes, 0));
Shift =
CurDAG->getTargetNode(SPU::ROTQMBIv2i64, VecVT,
SDValue(Shift, 0), SDValue(Bits, 0));
}
return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
}
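// Illustration, not part of this commit: the matching srl model (same
// assumptions as the shl model above). The SFIr32 nodes exist because
// rotqmby/rotqmbi interpret their operands as negative shift counts:
uint64_t srl_i64_model(uint64_t x, unsigned amt) {
  unsigned __int128 q = (unsigned __int128)x << 64;  // low 64 bits are zero
  q >>= (amt >> 3) * 8;                              // ROTQMBY(I), count -bytes
  q >>= (amt & 7);                                   // ROTQMBI(I), count -bits
  return (uint64_t)(q >> 64);
}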
/*!
* Emit the instruction sequence for i64 arithmetic right shifts.
*
 * @param Op The sra operand
 * @param OpVT Op's machine value type (doesn't need to be passed, but
* makes life easier.)
* @return The SDNode with the entire instruction sequence
*/
SDNode *
SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
// Promote Op0 to vector
MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftAmtVT = ShiftAmt.getValueType();
SDNode *VecOp0 =
CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op.getOperand(0));
SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT);
SDNode *SignRot =
CurDAG->getTargetNode(SPU::ROTMAIv2i64_i32, MVT::v2i64,
SDValue(VecOp0, 0), SignRotAmt);
SDNode *UpperHalfSign =
CurDAG->getTargetNode(SPU::ORi32_v4i32, MVT::i32, SDValue(SignRot, 0));
SDNode *UpperHalfSignMask =
CurDAG->getTargetNode(SPU::FSM64r32, VecVT, SDValue(UpperHalfSign, 0));
SDNode *UpperLowerMask =
CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT,
CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
SDNode *UpperLowerSelect =
CurDAG->getTargetNode(SPU::SELBv2i64, VecVT,
SDValue(UpperHalfSignMask, 0),
SDValue(VecOp0, 0),
SDValue(UpperLowerMask, 0));
SDNode *Shift = 0;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
unsigned bits = unsigned(CN->getZExtValue()) & 7;
if (bytes > 0) {
bytes = 31 - bytes;
Shift =
CurDAG->getTargetNode(SPU::ROTQBYIv2i64, VecVT,
SDValue(UpperLowerSelect, 0),
CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
bits = 8 - bits;
Shift =
CurDAG->getTargetNode(SPU::ROTQBIIv2i64, VecVT,
SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *NegShift =
CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT,
ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
Shift =
CurDAG->getTargetNode(SPU::ROTQBYBIv2i64_r32, VecVT,
SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
Shift =
CurDAG->getTargetNode(SPU::ROTQBIv2i64, VecVT,
SDValue(Shift, 0), SDValue(NegShift, 0));
}
return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
}
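// Illustration, not part of this commit (same assumptions as the shl
// model): the quad's low half is filled with the FSM-replicated sign word,
// and the rotate counts (31 - bytes, 8 - bits) are chosen so the combined
// left rotate amounts to 128 - amt, i.e. a right rotate that wraps sign
// bits into the vacated top bits:
int64_t sra_i64_model(int64_t x, unsigned amt) {
  if (amt == 0) return x;
  unsigned __int128 sign = (uint64_t)(x >> 63);      // 0 or all-ones
  unsigned __int128 q = ((unsigned __int128)(uint64_t)x << 64) | sign;
  q = (q >> amt) | (q << (128 - amt));               // rotate right by amt
  return (int64_t)(uint64_t)(q >> 64);
}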
/// createSPUISelDag - This pass converts a legalized DAG into a
/// SPU-specific DAG, ready for instruction scheduling.
///
FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) {

View File

@ -204,10 +204,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::SRL, MVT::i8, Custom);
setOperationAction(ISD::SRA, MVT::i8, Custom);
// SPU needs custom lowering for shift left/right for i64
setOperationAction(ISD::SHL, MVT::i64, Custom);
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
// Make these operations legal and handle them during instruction selection:
setOperationAction(ISD::SHL, MVT::i64, Legal);
setOperationAction(ISD::SRL, MVT::i64, Legal);
setOperationAction(ISD::SRA, MVT::i64, Legal);
// Custom lower i8, i32 and i64 multiplications
setOperationAction(ISD::MUL, MVT::i8, Custom);
@ -215,6 +215,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::MUL, MVT::i64, Expand); // libcall
// Need to custom handle (some) common i8, i64 math ops
setOperationAction(ISD::ADD, MVT::i8, Custom);
setOperationAction(ISD::ADD, MVT::i64, Custom);
setOperationAction(ISD::SUB, MVT::i8, Custom);
setOperationAction(ISD::SUB, MVT::i64, Custom);
@ -249,7 +250,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// Zero extension and sign extension for i64 have to be
// custom legalized
setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
// Custom lower i128 -> i64 truncates
@ -262,7 +262,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
// FDIV on SPU requires custom lowering
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Expand); // libcall
// SPU has [U|S]INT_TO_FP
@ -340,7 +339,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::ADD , VT, Legal);
setOperationAction(ISD::SUB , VT, Legal);
// mul has to be custom lowered.
setOperationAction(ISD::MUL , VT, Custom);
// TODO: v2i64 vector multiply
setOperationAction(ISD::MUL , VT, Legal);
setOperationAction(ISD::AND , VT, Legal);
setOperationAction(ISD::OR , VT, Legal);
@ -354,7 +354,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FDIV, VT, Custom);
// Custom lower build_vector, constant pool spills, insert and
// extract vector elements:
@ -371,9 +370,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::XOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
// FIXME: This is only temporary until I put all vector multiplications in
// SPUInstrInfo.td:
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setShiftAmountType(MVT::i32);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
@ -411,10 +408,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY";
node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU";
node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH";
node_names[(unsigned) SPUISD::MPYHH] = "SPUISD::MPYHH";
node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
@ -422,21 +415,12 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
node_names[(unsigned) SPUISD::ROTQUAD_RZ_BYTES] =
"SPUISD::ROTQUAD_RZ_BYTES";
node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] =
"SPUISD::ROTQUAD_RZ_BITS";
node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
"SPUISD::ROTBYTES_LEFT_BITS";
node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
node_names[(unsigned) SPUISD::ADD_EXTENDED] = "SPUISD::ADD_EXTENDED";
node_names[(unsigned) SPUISD::CARRY_GENERATE] = "SPUISD::CARRY_GENERATE";
node_names[(unsigned) SPUISD::SUB_EXTENDED] = "SPUISD::SUB_EXTENDED";
node_names[(unsigned) SPUISD::BORROW_GENERATE] = "SPUISD::BORROW_GENERATE";
node_names[(unsigned) SPUISD::FPInterp] = "SPUISD::FPInterp";
node_names[(unsigned) SPUISD::FPRecipEst] = "SPUISD::FPRecipEst";
node_names[(unsigned) SPUISD::SEXT32TO64] = "SPUISD::SEXT32TO64";
}
@ -1922,182 +1906,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
static SDValue LowerVectorMUL(SDValue Op, SelectionDAG &DAG) {
switch (Op.getValueType().getSimpleVT()) {
default:
cerr << "CellSPU: Unknown vector multiplication, got "
<< Op.getValueType().getMVTString()
<< "\n";
abort();
/*NOTREACHED*/
case MVT::v4i32:
break;
// Multiply two v8i16 vectors (pipeline friendly version):
// a) multiply lower halves, mask off upper 16-bit of 32-bit product
// b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes)
// c) Use SELB to select upper and lower halves from the intermediate results
//
// NOTE: We really want to move the SELECT_MASK to earlier to actually get the
// dual-issue. This code does manage to do this, even if it's a little on
// the wacky side
case MVT::v8i16: {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
SDValue Chain = Op.getOperand(0);
SDValue rA = Op.getOperand(0);
SDValue rB = Op.getOperand(1);
unsigned FSMBIreg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
unsigned HiProdReg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
SDValue FSMBOp =
DAG.getCopyToReg(Chain, FSMBIreg,
DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
DAG.getConstant(0xcccc, MVT::i16)));
SDValue HHProd =
DAG.getCopyToReg(FSMBOp, HiProdReg,
DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB));
SDValue HHProd_v4i32 =
DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32));
return DAG.getNode(SPUISD::SELB, MVT::v8i16,
DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB),
DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32,
HHProd_v4i32,
DAG.getConstant(16, MVT::i16))),
DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32));
}
// This M00sE is N@stI! (apologies to Monty Python)
//
// SPU doesn't know how to do any 8-bit multiplication, so the solution
// is to break it all apart, sign extend, and reassemble the various
// intermediate products.
case MVT::v16i8: {
SDValue rA = Op.getOperand(0);
SDValue rB = Op.getOperand(1);
SDValue c8 = DAG.getConstant(8, MVT::i32);
SDValue c16 = DAG.getConstant(16, MVT::i32);
SDValue LLProd =
DAG.getNode(SPUISD::MPY, MVT::v8i16,
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA),
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB));
SDValue rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8);
SDValue rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8);
SDValue LHProd =
DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16,
DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8);
SDValue FSMBmask = DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
DAG.getConstant(0x2222, MVT::i16));
SDValue LoProdParts =
DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
DAG.getNode(SPUISD::SELB, MVT::v8i16,
LLProd, LHProd, FSMBmask));
SDValue LoProdMask = DAG.getConstant(0xffff, MVT::i32);
SDValue LoProd =
DAG.getNode(ISD::AND, MVT::v4i32,
LoProdParts,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
LoProdMask, LoProdMask,
LoProdMask, LoProdMask));
SDValue rAH =
DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16);
SDValue rBH =
DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16);
SDValue HLProd =
DAG.getNode(SPUISD::MPY, MVT::v8i16,
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH),
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH));
SDValue HHProd_1 =
DAG.getNode(SPUISD::MPY, MVT::v8i16,
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
DAG.getNode(SPUISD::VEC_SRA,
MVT::v4i32, rAH, c8)),
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
DAG.getNode(SPUISD::VEC_SRA,
MVT::v4i32, rBH, c8)));
SDValue HHProd =
DAG.getNode(SPUISD::SELB, MVT::v8i16,
HLProd,
DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8),
FSMBmask);
SDValue HiProd =
DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, HHProd, c16);
return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8,
DAG.getNode(ISD::OR, MVT::v4i32,
LoProd, HiProd));
}
}
return SDValue();
}
static SDValue LowerFDIVf32(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
MVT VT = Op.getValueType();
unsigned VRegBR, VRegC;
if (VT == MVT::f32) {
VRegBR = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
VRegC = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
} else {
VRegBR = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
VRegC = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
}
// TODO: make sure we're feeding FPInterp the right arguments
// Right now: fi B, frest(B)
// Computes BRcpl =
// (Floating Interpolate (FP Reciprocal Estimate B))
SDValue BRcpl =
DAG.getCopyToReg(DAG.getEntryNode(), VRegBR,
DAG.getNode(SPUISD::FPInterp, VT, B,
DAG.getNode(SPUISD::FPRecipEst, VT, B)));
// Computes A * BRcpl and stores in a temporary register
SDValue AxBRcpl =
DAG.getCopyToReg(BRcpl, VRegC,
DAG.getNode(ISD::FMUL, VT, A,
DAG.getCopyFromReg(BRcpl, VRegBR, VT)));
// What's the Chain variable do? It's magic!
// TODO: set Chain = Op(0).getEntryNode()
return DAG.getNode(ISD::FADD, VT,
DAG.getCopyFromReg(AxBRcpl, VRegC, VT),
DAG.getNode(ISD::FMUL, VT,
DAG.getCopyFromReg(AxBRcpl, VRegBR, VT),
DAG.getNode(ISD::FSUB, VT, A,
DAG.getNode(ISD::FMUL, VT, B,
DAG.getCopyFromReg(AxBRcpl, VRegC, VT)))));
}
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType();
SDValue N = Op.getOperand(0);
@ -2296,18 +2104,23 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
assert(0 && "Unhandled i8 math operator");
/*NOTREACHED*/
break;
case ISD::ADD: {
// 8-bit addition: Promote the arguments up to 16-bits and truncate
// the result:
SDValue N1 = Op.getOperand(1);
N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
return DAG.getNode(ISD::TRUNCATE, MVT::i8,
DAG.getNode(Opc, MVT::i16, N0, N1));
}
case ISD::SUB: {
// 8-bit subtraction: Promote the arguments up to 16-bits and truncate
// the result:
SDValue N1 = Op.getOperand(1);
N0 = (N0.getOpcode() != ISD::Constant
? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
: DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(),
MVT::i16));
N1 = (N1.getOpcode() != ISD::Constant
? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1)
: DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(),
MVT::i16));
N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
return DAG.getNode(ISD::TRUNCATE, MVT::i8,
DAG.getNode(Opc, MVT::i16, N0, N1));
}
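// Illustration, not part of this commit: the scalar effect of the
// promote-and-truncate lowering. The wider op agrees with a native i8
// op modulo 2^8, so truncation recovers the exact result:
//
//   int8_t add8(int8_t a, int8_t b) {
//     return (int8_t)((int16_t)a + (int16_t)b);  // == (int8_t)(a + b)
//   }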
@ -2397,7 +2210,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
switch (Opc) {
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {
MVT Op0VT = Op0.getValueType();
MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
@ -2410,39 +2222,16 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
SDValue PromoteScalar =
DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0);
if (Opc != ISD::SIGN_EXTEND) {
// Use a shuffle to zero extend the i32 to i64 directly:
SDValue shufMask =
DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT,
DAG.getConstant(0x80808080, MVT::i32),
DAG.getConstant(0x00010203, MVT::i32),
DAG.getConstant(0x80808080, MVT::i32),
DAG.getConstant(0x08090a0b, MVT::i32));
SDValue zextShuffle =
DAG.getNode(SPUISD::SHUFB, Op0VecVT,
PromoteScalar, PromoteScalar, shufMask);
// Use a shuffle to zero extend the i32 to i64 directly:
SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT,
DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant(0x00010203,
MVT::i32), DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant(
0x08090a0b, MVT::i32));
SDValue zextShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, PromoteScalar,
PromoteScalar, shufMask);
return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle));
} else {
// SPU has no "rotate quadword and replicate bit 0" (i.e. rotate/shift
// right and propagate the sign bit) instruction.
SDValue RotQuad =
DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, Op0VecVT,
PromoteScalar, DAG.getConstant(4, MVT::i32));
SDValue SignQuad =
DAG.getNode(SPUISD::VEC_SRA, Op0VecVT,
PromoteScalar, DAG.getConstant(32, MVT::i32));
SDValue SelMask =
DAG.getNode(SPUISD::SELECT_MASK, Op0VecVT,
DAG.getConstant(0xf0f0, MVT::i16));
SDValue CombineQuad =
DAG.getNode(SPUISD::SELB, Op0VecVT,
SignQuad, RotQuad, SelMask);
return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
DAG.getNode(ISD::BIT_CONVERT, VecVT, CombineQuad));
}
return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, DAG.getNode(ISD::BIT_CONVERT,
VecVT, zextShuffle));
}
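// Illustration, not part of this commit (assumed CBEA shufb semantics):
// control bytes 0x00-0x1f select a byte of the concatenated inputs, while
// 0x80-0xbf produce 0x00. The mask words (0x80808080, 0x00010203,
// 0x80808080, 0x08090a0b) therefore assemble a zero word followed by a
// source word in each doubleword, zero-extending the preferred-slot i32
// to i64 with no separate AND mask.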
case ISD::ADD: {
@ -2502,88 +2291,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64,
Op0, Op1, ShiftedBorrow));
}
case ISD::SHL: {
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftAmtVT = ShiftAmt.getValueType();
SDValue Op0Vec = DAG.getNode(SPUISD::PREFSLOT2VEC, VecVT, Op0);
SDValue MaskLower =
DAG.getNode(SPUISD::SELB, VecVT,
Op0Vec,
DAG.getConstant(0, VecVT),
DAG.getNode(SPUISD::SELECT_MASK, VecVT,
DAG.getConstant(0xff00ULL, MVT::i16)));
SDValue ShiftAmtBytes =
DAG.getNode(ISD::SRL, ShiftAmtVT,
ShiftAmt,
DAG.getConstant(3, ShiftAmtVT));
SDValue ShiftAmtBits =
DAG.getNode(ISD::AND, ShiftAmtVT,
ShiftAmt,
DAG.getConstant(7, ShiftAmtVT));
return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
DAG.getNode(SPUISD::SHLQUAD_L_BITS, VecVT,
DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT,
MaskLower, ShiftAmtBytes),
ShiftAmtBits));
}
case ISD::SRL: {
MVT VT = Op.getValueType();
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftAmtVT = ShiftAmt.getValueType();
SDValue ShiftAmtBytes =
DAG.getNode(ISD::SRL, ShiftAmtVT,
ShiftAmt,
DAG.getConstant(3, ShiftAmtVT));
SDValue ShiftAmtBits =
DAG.getNode(ISD::AND, ShiftAmtVT,
ShiftAmt,
DAG.getConstant(7, ShiftAmtVT));
return DAG.getNode(SPUISD::ROTQUAD_RZ_BITS, VT,
DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, VT,
Op0, ShiftAmtBytes),
ShiftAmtBits);
}
case ISD::SRA: {
// Promote Op0 to vector
SDValue Op0 =
DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0));
SDValue ShiftAmt = Op.getOperand(1);
MVT ShiftVT = ShiftAmt.getValueType();
// Negate variable shift amounts
if (!isa<ConstantSDNode>(ShiftAmt)) {
ShiftAmt = DAG.getNode(ISD::SUB, ShiftVT,
DAG.getConstant(0, ShiftVT), ShiftAmt);
}
SDValue UpperHalfSign =
DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i32,
DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
DAG.getNode(SPUISD::VEC_SRA, MVT::v2i64,
Op0, DAG.getConstant(31, MVT::i32))));
SDValue UpperHalfSignMask =
DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, UpperHalfSign);
SDValue UpperLowerMask =
DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64,
DAG.getConstant(0xff00, MVT::i16));
SDValue UpperLowerSelect =
DAG.getNode(SPUISD::SELB, MVT::v2i64,
UpperHalfSignMask, Op0, UpperLowerMask);
SDValue RotateLeftBytes =
DAG.getNode(SPUISD::ROTBYTES_LEFT_BITS, MVT::v2i64,
UpperLowerSelect, ShiftAmt);
SDValue RotateLeftBits =
DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v2i64,
RotateLeftBytes, ShiftAmt);
return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
RotateLeftBits);
}
}
return SDValue();
@ -2890,10 +2597,11 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return LowerRET(Op, DAG, getTargetMachine());
// i8, i64 math ops:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND:
return LowerI64Math(Op, DAG, Opc);
// i8, i64 math ops:
case ISD::ADD:
case ISD::SUB:
case ISD::ROTR:
@ -2928,22 +2636,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
// Vector and i8 multiply:
case ISD::MUL:
if (VT.isVector())
return LowerVectorMUL(Op, DAG);
else if (VT == MVT::i8)
if (VT == MVT::i8)
return LowerI8Math(Op, DAG, Opc, *this);
case ISD::FDIV:
if (VT == MVT::f32 || VT == MVT::v4f32)
return LowerFDIVf32(Op, DAG);
#if 0
// This is probably a libcall
else if (Op.getValueType() == MVT::f64)
return LowerFDIVf64(Op, DAG);
#endif
else
assert(0 && "Calling FDIV on unsupported MVT");
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
@ -3119,8 +2814,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
case SPUISD::VEC_SHL:
case SPUISD::VEC_SRL:
case SPUISD::VEC_SRA:
case SPUISD::ROTQUAD_RZ_BYTES:
case SPUISD::ROTQUAD_RZ_BITS:
case SPUISD::ROTBYTES_LEFT: {
SDValue Op1 = N->getOperand(1);
@ -3268,10 +2961,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
}
#if 0
case MPY:
case MPYU:
case MPYH:
case MPYHH:
case SPUISD::SHLQUAD_L_BITS:
case SPUISD::SHLQUAD_L_BYTES:
case SPUISD::VEC_SHL:
@ -3279,18 +2968,14 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
case SPUISD::VEC_SRA:
case SPUISD::VEC_ROTL:
case SPUISD::VEC_ROTR:
case SPUISD::ROTQUAD_RZ_BYTES:
case SPUISD::ROTQUAD_RZ_BITS:
case SPUISD::ROTBYTES_LEFT:
case SPUISD::SELECT_MASK:
case SPUISD::SELB:
case SPUISD::FPInterp:
case SPUISD::FPRecipEst:
case SPUISD::SEXT32TO64:
#endif
}
}
unsigned
SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
unsigned Depth) const {

View File

@ -24,10 +24,10 @@ namespace llvm {
enum NodeType {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
// Pseudo instructions:
RET_FLAG, ///< Return with flag, matched by bi instruction
Hi, ///< High address component (upper 16)
Lo, ///< Low address component (lower 16)
PCRelAddr, ///< Program counter relative address
@ -41,10 +41,6 @@ namespace llvm {
CNTB, ///< Count leading ones in bytes
PREFSLOT2VEC, ///< Promote scalar->vector
VEC2PREFSLOT, ///< Extract element 0
MPY, ///< 16-bit Multiply (low parts of a 32-bit)
MPYU, ///< Multiply Unsigned
MPYH, ///< Multiply High
MPYHH, ///< Multiply High-High
SHLQUAD_L_BITS, ///< Rotate quad left, by bits
SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes
VEC_SHL, ///< Vector shift left
@ -52,8 +48,6 @@ namespace llvm {
VEC_SRA, ///< Vector shift right (arithmetic)
VEC_ROTL, ///< Vector rotate left
VEC_ROTR, ///< Vector rotate right
ROTQUAD_RZ_BYTES, ///< Rotate quad right, by bytes, zero fill
ROTQUAD_RZ_BITS, ///< Rotate quad right, by bits, zero fill
ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI)
ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count
SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI)
@ -63,8 +57,6 @@ namespace llvm {
CARRY_GENERATE, ///< Carry generate for ADD_EXTENDED
SUB_EXTENDED, ///< Subtract extended, with borrow
BORROW_GENERATE, ///< Borrow generate for SUB_EXTENDED
FPInterp, ///< Floating point interpolate
FPRecipEst, ///< Floating point reciprocal estimate
SEXT32TO64, ///< Sign-extended 32-bit const -> 64-bits
LAST_SPUISD ///< Last user-defined instruction
};
@ -87,7 +79,7 @@ namespace llvm {
}
class SPUTargetMachine; // forward dec'l.
class SPUTargetLowering :
public TargetLowering
{
@ -97,14 +89,14 @@ namespace llvm {
public:
SPUTargetLowering(SPUTargetMachine &TM);
/// getTargetNodeName() - This method returns the name of a target specific
/// DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ValueType for ISD::SETCC
virtual MVT getSetCCResultType(const SDValue &) const;
//! Custom lowering hooks
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
@ -116,7 +108,7 @@ namespace llvm {
virtual void computeMaskedBitsForTargetNode(const SDValue Op,
const APInt &Mask,
APInt &KnownZero,
APInt &KnownZero,
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth = 0) const;
@ -126,12 +118,12 @@ namespace llvm {
ConstraintType getConstraintType(const std::string &ConstraintLetter) const;
std::pair<unsigned, const TargetRegisterClass*>
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const;
void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
bool hasMemory,
bool hasMemory,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const;

View File

@ -82,7 +82,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
case SPU::ORIi8i32:
case SPU::AHIvec:
case SPU::AHIr16:
case SPU::AIvec:
case SPU::AIv4i32:
assert(MI.getNumOperands() == 3 &&
MI.getOperand(0).isReg() &&
MI.getOperand(1).isReg() &&
@ -98,8 +98,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
assert(MI.getNumOperands() == 3 &&
"wrong number of operands to AIr32");
if (MI.getOperand(0).isReg() &&
(MI.getOperand(1).isReg() ||
MI.getOperand(1).isFI()) &&
MI.getOperand(1).isReg() &&
(MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0)) {
sourceReg = MI.getOperand(1).getReg();
@ -265,7 +264,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
// reg class to any other reg class containing R3. This is required because
// we instruction select bitconvert i64 -> f64 as a noop for example, so our
// types have no specific meaning.
if (DestRC == SPU::R8CRegisterClass) {
BuildMI(MBB, MI, get(SPU::ORBIr8), DestReg).addReg(SrcReg).addImm(0);
} else if (DestRC == SPU::R16CRegisterClass) {
@ -291,7 +290,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
// Attempt to copy unknown/unsupported register class!
return false;
}
return true;
}
@ -464,7 +463,7 @@ SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
unsigned OpNum = Ops[0];
unsigned Opc = MI->getOpcode();
MachineInstr *NewMI = 0;
if ((Opc == SPU::ORr32
|| Opc == SPU::ORv4i32)
&& MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
@ -508,7 +507,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// Get the last instruction in the block.
MachineInstr *LastInst = I;
// If there is only one terminator instruction, process it.
if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
if (isUncondBranch(LastInst)) {
@ -524,7 +523,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// Otherwise, don't know what this is.
return true;
}
// Get the instruction before it if it's a terminator.
MachineInstr *SecondLastInst = I;
@ -532,7 +531,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
if (SecondLastInst && I != MBB.begin() &&
isUnpredicatedTerminator(--I))
return true;
// If the block ends with a conditional and unconditional branch, handle it.
if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
TBB = SecondLastInst->getOperand(1).getMBB();
@ -541,7 +540,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
FBB = LastInst->getOperand(0).getMBB();
return false;
}
// If the block ends with two unconditional branches, handle it. The second
// one is not executed, so remove it.
if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
@ -554,7 +553,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// Otherwise, can't handle this.
return true;
}
unsigned
SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator I = MBB.end();
@ -578,16 +577,16 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
I->eraseFromParent();
return 2;
}
unsigned
SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 2 || Cond.size() == 0) &&
assert((Cond.size() == 2 || Cond.size() == 0) &&
"SPU branch conditions have two components!");
// One-way branch.
if (FBB == 0) {
if (Cond.empty()) // Unconditional branch
@ -600,7 +599,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
}
return 1;
}
// Two-way Conditional Branch.
#if 0
BuildMI(&MBB, get(SPU::BRNZ))

View File

@ -583,7 +583,9 @@ def AHIvec:
def AHIr16:
RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
"ahi\t$rT, $rA, $val", IntegerOp,
[(set R16C:$rT, (add R16C:$rA, v8i16SExt10Imm:$val))]>;
[(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>;
// v4i32, i32 add instruction:
class AInst<dag OOL, dag IOL, list<dag> pattern>:
RRForm<0b00000011000, OOL, IOL,
@ -604,21 +606,42 @@ multiclass AddInstruction {
def v16i8: AVecInst<v16i8>;
def r32: ARegInst<R32C>;
def r8: AInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), [/* no pattern */]>;
}
defm A : AddInstruction;
def AIvec:
RI10Form<0b00111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
"ai\t$rT, $rA, $val", IntegerOp,
[(set (v4i32 VECREG:$rT), (add (v4i32 VECREG:$rA),
v4i32SExt10Imm:$val))]>;
class AIInst<dag OOL, dag IOL, list<dag> pattern>:
RI10Form<0b00111000, OOL, IOL,
"ai\t$rT, $rA, $val", IntegerOp,
pattern>;
def AIr32:
RI10Form<0b00111000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
"ai\t$rT, $rA, $val", IntegerOp,
[(set R32C:$rT, (add R32C:$rA, i32ImmSExt10:$val))]>;
class AIVecInst<ValueType vectype, PatLeaf immpred>:
AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
[(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>;
class AIFPVecInst<ValueType vectype, PatLeaf immpred>:
AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
[/* no pattern */]>;
class AIRegInst<RegisterClass rclass, PatLeaf immpred>:
AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
[(set rclass:$rT, (add rclass:$rA, immpred:$val))]>;
// This is used to add epsilons to floating point numbers in the f32 fdiv code:
class AIFPInst<RegisterClass rclass, PatLeaf immpred>:
AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
[/* no pattern */]>;
multiclass AddImmediate {
def v4i32: AIVecInst<v4i32, v4i32SExt10Imm>;
def r32: AIRegInst<R32C, i32ImmSExt10>;
def v4f32: AIFPVecInst<v4f32, v4i32SExt10Imm>;
def f32: AIFPInst<R32FP, i32ImmSExt10>;
}
defm AI : AddImmediate;
def SFHvec:
RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
@ -795,8 +818,7 @@ def BGXvec:
def MPYv8i16:
RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpy\t$rT, $rA, $rB", IntegerMulDiv,
[(set (v8i16 VECREG:$rT), (SPUmpy_vec (v8i16 VECREG:$rA),
(v8i16 VECREG:$rB)))]>;
[/* no pattern */]>;
def MPYr16:
RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
@ -812,8 +834,7 @@ class MPYUInst<dag OOL, dag IOL, list<dag> pattern>:
def MPYUv4i32:
MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[(set (v4i32 VECREG:$rT),
(SPUmpyu_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
[/* no pattern */]>;
def MPYUr16:
MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
@ -821,7 +842,7 @@ def MPYUr16:
def MPYUr32:
MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
[(set R32C:$rT, (SPUmpyu_int R32C:$rA, R32C:$rB))]>;
[/* no pattern */]>;
// mpyi: multiply 16 x s10imm -> 32 result.
@ -892,87 +913,78 @@ class MPYHInst<dag OOL, dag IOL, list<dag> pattern>:
def MPYHv4i32:
MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[(set (v4i32 VECREG:$rT),
(SPUmpyh_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
[/* no pattern */]>;
def MPYHr32:
MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
[(set R32C:$rT, (SPUmpyh_int R32C:$rA, R32C:$rB))]>;
[/* no pattern */]>;
// mpys: multiply high and shift right (returns the top half of
// a 16-bit multiply, sign extended to 32 bits.)
def MPYSvec:
RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpys\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
def MPYSr16:
RRForm<0b11100011110, (outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
class MPYSInst<dag OOL, dag IOL>:
RRForm<0b11100011110, OOL, IOL,
"mpys\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
[/* no pattern */]>;
def MPYSvec:
MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
def MPYSr16:
MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>;
// mpyhh: multiply high-high (returns the 32-bit result from multiplying
// the top 16 bits of the $rA, $rB)
class MPYHHInst<dag OOL, dag IOL>:
RRForm<0b01100011110, OOL, IOL,
"mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
[/* no pattern */]>;
def MPYHHv8i16:
RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
[(set (v8i16 VECREG:$rT),
(SPUmpyhh_vec (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
def MPYHHr32:
RRForm<0b01100011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
"mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
// mpyhha: Multiply high-high, add to $rT:
def MPYHHAvec:
RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
def MPYHHAr32:
RRForm<0b01100010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
class MPYHHAInst<dag OOL, dag IOL>:
RRForm<0b01100010110, OOL, IOL,
"mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
[/* no pattern */]>;
def MPYHHAvec:
MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
def MPYHHAr32:
MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
// mpyhhu: Multiply high-high, unsigned
def MPYHHUvec:
RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
def MPYHHUr32:
RRForm<0b01110011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
class MPYHHUInst<dag OOL, dag IOL>:
RRForm<0b01110011110, OOL, IOL,
"mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
[/* no pattern */]>;
def MPYHHUvec:
MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
def MPYHHUr32:
MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
// mpyhhau: Multiply high-high, unsigned
class MPYHHAUInst<dag OOL, dag IOL>:
RRForm<0b01110010110, OOL, IOL,
"mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
[/* no pattern */]>;
def MPYHHAUvec:
RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
def MPYHHAUr32:
RRForm<0b01110010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
"mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
[]>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v4i32, i32 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def MPYv4i32:
Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
(Av4i32
(Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
(MPYHv4i32 VECREG:$rB, VECREG:$rA)),
(MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
def MPYi32:
Pat<(mul R32C:$rA, R32C:$rB),
(Ar32
(Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
(MPYHr32 R32C:$rB, R32C:$rA)),
(MPYUr32 R32C:$rA, R32C:$rB))>;
MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// clz: Count leading zeroes
@ -983,7 +995,7 @@ class CLZInst<dag OOL, dag IOL, list<dag> pattern>:
class CLZRegInst<RegisterClass rclass>:
CLZInst<(outs rclass:$rT), (ins rclass:$rA),
[(set rclass:$rT, (ctlz rclass:$rA))]>;
[(set rclass:$rT, (ctlz rclass:$rA))]>;
class CLZVecInst<ValueType vectype>:
CLZInst<(outs VECREG:$rT), (ins VECREG:$rA),
@ -1424,7 +1436,7 @@ multiclass BitwiseOr
def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
[/* no pattern */]>;
// scalar->vector promotion:
// scalar->vector promotion, prefslot2vec:
def v16i8_i8: ORPromoteScalar<R8C>;
def v8i16_i16: ORPromoteScalar<R16C>;
def v4i32_i32: ORPromoteScalar<R32C>;
@ -1432,7 +1444,7 @@ multiclass BitwiseOr
def v4f32_f32: ORPromoteScalar<R32FP>;
def v2f64_f64: ORPromoteScalar<R64FP>;
// extract element 0:
// vector->scalar demotion, vec2prefslot:
def i8_v16i8: ORExtractElt<R8C>;
def i16_v8i16: ORExtractElt<R16C>;
def i32_v4i32: ORExtractElt<R32C>;
@ -1831,6 +1843,13 @@ class SELBVecInst<ValueType vectype>:
(and (vnot (vectype VECREG:$rC)),
(vectype VECREG:$rA))))]>;
class SELBVecVCondInst<ValueType vectype>:
SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
[(set (vectype VECREG:$rT),
(select (vectype VECREG:$rC),
(vectype VECREG:$rB),
(vectype VECREG:$rA)))]>;
class SELBVecCondInst<ValueType vectype>:
SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC),
[(set (vectype VECREG:$rT),
@ -1867,8 +1886,21 @@ multiclass SelectBits
def v4i32_cond: SELBVecCondInst<v4i32>;
def v2i64_cond: SELBVecCondInst<v2i64>;
def v16i8_vcond: SELBVecCondInst<v16i8>;
def v8i16_vcond: SELBVecCondInst<v8i16>;
def v4i32_vcond: SELBVecCondInst<v4i32>;
def v2i64_vcond: SELBVecCondInst<v2i64>;
def v4f32_cond:
SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
[(set (v4f32 VECREG:$rT),
(select (v4i32 VECREG:$rC),
(v4f32 VECREG:$rB),
(v4f32 VECREG:$rA)))]>;
// SELBr64_cond is defined further down, look for i64 comparisons
def r32_cond: SELBRegCondInst<R32C, R32C>;
def f32_cond: SELBRegCondInst<R32C, R32FP>;
def r16_cond: SELBRegCondInst<R16C, R16C>;
def r8_cond: SELBRegCondInst<R8C, R8C>;
}
@ -2454,11 +2486,11 @@ class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>:
RotateShift, pattern>;
class ROTQBIVecInst<ValueType vectype>:
ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
[/* no pattern yet */]>;
class ROTQBIRegInst<RegisterClass rclass>:
ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
[/* no pattern yet */]>;
multiclass RotateQuadByBitCount
@ -2645,9 +2677,6 @@ def : Pat<(srl R32C:$rA, (i8 imm:$val)),
// ROTQMBYvec: This is a vector form merely so that when used in an
// instruction pattern, type checking will succeed. This instruction assumes
// that the user knew to negate $rB.
//
// Using the SPUrotquad_rz_bytes target-specific DAG node, the patterns
// ensure that $rB is negated.
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>:
@ -2660,8 +2689,7 @@ class ROTQMBYVecInst<ValueType vectype>:
class ROTQMBYRegInst<RegisterClass rclass>:
ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
[(set rclass:$rT,
(SPUrotquad_rz_bytes rclass:$rA, R32C:$rB))]>;
[/* no pattern */]>;
multiclass RotateQuadBytes
{
@ -2676,32 +2704,17 @@ multiclass RotateQuadBytes
defm ROTQMBY : RotateQuadBytes;
def : Pat<(SPUrotquad_rz_bytes (v16i8 VECREG:$rA), R32C:$rB),
(ROTQMBYv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bytes (v8i16 VECREG:$rA), R32C:$rB),
(ROTQMBYv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bytes (v4i32 VECREG:$rA), R32C:$rB),
(ROTQMBYv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bytes (v2i64 VECREG:$rA), R32C:$rB),
(ROTQMBYv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bytes GPRC:$rA, R32C:$rB),
(ROTQMBYr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bytes R64C:$rA, R32C:$rB),
(ROTQMBYr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>;
class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>:
RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val",
RotateShift, pattern>;
class ROTQMBYIVecInst<ValueType vectype>:
ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
[(set (vectype VECREG:$rT),
(SPUrotquad_rz_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
[/* no pattern */]>;
class ROTQMBYIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, PatLeaf pred>:
ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
[(set rclass:$rT,
(SPUrotquad_rz_bytes rclass:$rA, (inttype pred:$val)))]>;
[/* no pattern */]>;
multiclass RotateQuadBytesImm
{
@ -2725,8 +2738,8 @@ class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
RotateShift, pattern>;
class ROTQMBYBIVecInst<ValueType vectype>:
ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[/* no pattern, intrinsic? */]>;
ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
[/* no pattern, */]>;
multiclass RotateMaskQuadByBitCount
{
@ -2768,19 +2781,6 @@ multiclass RotateMaskQuadByBits
defm ROTQMBI: RotateMaskQuadByBits;
def : Pat<(SPUrotquad_rz_bits (v16i8 VECREG:$rA), R32C:$rB),
(ROTQMBIv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bits (v8i16 VECREG:$rA), R32C:$rB),
(ROTQMBIv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bits (v4i32 VECREG:$rA), R32C:$rB),
(ROTQMBIv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bits (v2i64 VECREG:$rA), R32C:$rB),
(ROTQMBIv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bits GPRC:$rA, R32C:$rB),
(ROTQMBIr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>;
def : Pat<(SPUrotquad_rz_bits R64C:$rA, R32C:$rB),
(ROTQMBIr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// Rotate quad and mask by bits, immediate
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
@ -2791,13 +2791,11 @@ class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>:
class ROTQMBIIVecInst<ValueType vectype>:
ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
[(set (vectype VECREG:$rT),
(SPUrotquad_rz_bits (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
[/* no pattern */]>;
class ROTQMBIIRegInst<RegisterClass rclass>:
ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val),
[(set rclass:$rT,
(SPUrotquad_rz_bits rclass:$rA, (i32 uimm7:$val)))]>;
[/* no pattern */]>;
multiclass RotateMaskQuadByBitsImm
{
@ -3142,6 +3140,15 @@ multiclass CmpGtrWordImm
def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
[(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>;
// CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence:
def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
[(set (v4i32 VECREG:$rT),
(setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))),
(v4i32 v4i32SExt16Imm:$val)))]>;
def f32: CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val),
[/* no pattern */]>;
}
class CLGTBInst<dag OOL, dag IOL, list<dag> pattern> :
@ -3750,62 +3757,63 @@ let isTerminator = 1, isBarrier = 1 in {
class FAInst<dag OOL, dag IOL, list<dag> pattern>:
RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB",
SPrecFP, pattern>;
SPrecFP, pattern>;
class FAVecInst<ValueType vectype>:
FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[(set (vectype VECREG:$rT),
(fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
(fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
multiclass SFPAdd
{
def v4f32: FAVecInst<v4f32>;
def r32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
[(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
def f32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
[(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
}
defm FA : SFPAdd;
class FSInst<dag OOL, dag IOL, list<dag> pattern>:
RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB",
SPrecFP, pattern>;
SPrecFP, pattern>;
class FSVecInst<ValueType vectype>:
FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[(set (vectype VECREG:$rT),
(fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
[(set (vectype VECREG:$rT),
(fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
multiclass SFPSub
{
def v4f32: FSVecInst<v4f32>;
def r32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
[(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
def f32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
[(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
}
defm FS : SFPSub;
// Floating point reciprocal estimate
def FREv4f32 :
RRForm_1<0b00011101100, (outs VECREG:$rT), (ins VECREG:$rA),
"frest\t$rT, $rA", SPrecFP,
[(set (v4f32 VECREG:$rT), (SPUreciprocalEst (v4f32 VECREG:$rA)))]>;
def FREf32 :
RRForm_1<0b00011101100, (outs R32FP:$rT), (ins R32FP:$rA),
"frest\t$rT, $rA", SPrecFP,
[(set R32FP:$rT, (SPUreciprocalEst R32FP:$rA))]>;
class FRESTInst<dag OOL, dag IOL>:
RRForm_1<0b00110111000, OOL, IOL,
"frest\t$rT, $rA", SPrecFP,
[/* no pattern */]>;
def FRESTv4f32 :
FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
def FRESTf32 :
FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>;
// Floating point interpolate (used in conjunction with reciprocal estimate)
def FIv4f32 :
RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
"fi\t$rT, $rA, $rB", SPrecFP,
[(set (v4f32 VECREG:$rT), (SPUinterpolate (v4f32 VECREG:$rA),
(v4f32 VECREG:$rB)))]>;
[/* no pattern */]>;
def FIf32 :
RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
"fi\t$rT, $rA, $rB", SPrecFP,
[(set R32FP:$rT, (SPUinterpolate R32FP:$rA, R32FP:$rB))]>;
[/* no pattern */]>;
//--------------------------------------------------------------------------
// Basic single precision floating point comparisons:
@ -4445,12 +4453,14 @@ def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0),
(SPUlo tconstpool:$in, 0)),
(IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
/*
def : Pat<(SPUindirect R32C:$sp, i32ImmSExt10:$imm),
(AIr32 R32C:$sp, i32ImmSExt10:$imm)>;
def : Pat<(SPUindirect R32C:$sp, imm:$imm),
(Ar32 R32C:$sp,
(IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm)))>;
*/
def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)),
(IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
@ -4466,5 +4476,7 @@ def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)),
// Intrinsics:
include "CellSDKIntrinsics.td"
// Various math operator instruction sequences
include "SPUMathInstr.td"
// 64-bit "instructions"/support
include "SPU64InstrInfo.td"


@ -0,0 +1,99 @@
//======--- SPUMathInstr.td - Cell SPU math operations -*- tablegen -*--======//
//
// Cell SPU math operations
//
// This target description file contains instruction sequences for various
// math operations, such as vector multiplies, i32 multiply, etc., for the
// SPU's i32, i16, and i8 types and the corresponding vector types.
//
// Any resemblance to libsimdmath or the Cell SDK simdmath library is
// purely and completely coincidental.
//
// Primary author: Scott Michel (scottm@aero.org)
//===----------------------------------------------------------------------===//
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v16i8 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)),
(ORv4i32
(ANDv4i32
(SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB),
(SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8),
(ROTMAHIv8i16 VECREG:$rB, 8)), 8),
(FSMBIv8i16 0x2222)),
(ILAv4i32 0x0000ffff)),
(SHLIv4i32
(SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16),
(ROTMAIv4i32_i32 VECREG:$rB, 16)),
(SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8),
(ROTMAIv4i32_i32 VECREG:$rB, 8)), 8),
(FSMBIv8i16 0x2222)), 16))>;
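Since the SPU has only 16-bit multipliers, the pattern above builds the sixteen byte products out of halfword multiplies on the original and byte-rotated operands, then merges the partial results under fsmbi-generated masks. A simplified scalar C analogue of the core even/odd-byte trick, for a single 16-bit lane (helper name is illustrative, not part of the backend):

#include <stdint.h>

/* Per-byte multiply inside one 16-bit lane using only 16-bit multiplies:
 * the low-byte product and the repositioned high-byte product are merged
 * with masks, much as selb merges lanes under an fsmbi-generated mask. */
uint16_t mul_bytes_in_halfword(uint16_t a, uint16_t b) {
    uint16_t lo = (uint16_t)((a & 0x00ff) * (b & 0x00ff));  /* low-byte product */
    uint16_t hi = (uint16_t)(((a >> 8) * (b >> 8)) << 8);   /* high-byte product, shifted back into place */
    return (uint16_t)((hi & 0xff00) | (lo & 0x00ff));       /* mask-and-merge */
}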
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v8i16 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
(SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB),
(SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16),
(FSMBIv8i16 0xcccc))>;
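Here the two halfword products within each word are computed separately (mpy for one, mpyhh plus shli for the other) and merged under the constant fsmbi 0xcccc mask. The same merge in scalar C, for one 32-bit word (hypothetical helper):

#include <stdint.h>

/* Per-halfword multiply inside one 32-bit word: low-halfword product
 * (mpy), high-halfword product shifted back up (mpyhh + shli 16),
 * merged with a constant mask (selb + fsmbi 0xcccc). */
uint32_t mul_halfwords_in_word(uint32_t a, uint32_t b) {
    uint32_t even = (a & 0xffffu) * (b & 0xffffu);       /* low 16x16 -> 32 */
    uint32_t odd  = ((a >> 16) * (b >> 16)) << 16;       /* high 16x16, low half kept */
    return (odd & 0xffff0000u) | (even & 0x0000ffffu);
}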
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v4i32, i32 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def MPYv4i32:
Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
(Av4i32
(Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
(MPYHv4i32 VECREG:$rB, VECREG:$rA)),
(MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
def MPYi32:
Pat<(mul R32C:$rA, R32C:$rB),
(Ar32
(Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
(MPYHr32 R32C:$rB, R32C:$rA)),
(MPYUr32 R32C:$rA, R32C:$rB))>;
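A full 32-bit multiply is assembled from three 16x16 partial products; the hi(a)*hi(b) term falls out because it lands entirely above bit 31. A C sketch of the identity the mpyh/mpyh/mpyu sum relies on (function name is illustrative):

#include <stdint.h>

/* 32-bit multiply from 16-bit multiplies, mirroring
 *   a * b == mpyh(a,b) + mpyh(b,a) + mpyu(a,b)   (mod 2^32)
 * where mpyh(x,y) = (hi16(x) * lo16(y)) << 16 and mpyu multiplies
 * the low halfwords unsigned. */
uint32_t mul32_from_16(uint32_t a, uint32_t b) {
    uint32_t mpyh_ab = ((a >> 16) * (b & 0xffffu)) << 16; /* hi(a)*lo(b) << 16 */
    uint32_t mpyh_ba = ((b >> 16) * (a & 0xffffu)) << 16; /* hi(b)*lo(a) << 16 */
    uint32_t mpyu_ab = (a & 0xffffu) * (b & 0xffffu);     /* lo(a)*lo(b)       */
    return mpyh_ab + mpyh_ba + mpyu_ab;  /* hi(a)*hi(b) << 32 vanishes mod 2^32 */
}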
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// f32, v4f32 divide instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// Reciprocal estimate and interpolation
def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>;
// Division estimate
def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>;
// Newton-Raphson iteration
def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA),
Interpf32.Fragment,
DivEstf32.Fragment)>;
// Epsilon addition
def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>;
def : Pat<(fdiv R32FP:$rA, R32FP:$rB),
(SELBf32_cond NRaphf32.Fragment,
Epsilonf32.Fragment,
(CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>;
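Read bottom-up, the fragments implement estimate-and-refine division: reciprocal estimate (frest + fi), a division estimate (fm), one Newton-Raphson step (fnms + fma), a one-ulp bump, and a final cgti + selb that keeps the bumped quotient only while the remainder a - q*b stays non-negative. A scalar C sketch of the same flow, with the exact reciprocal standing in for the hardware estimate and illustrative helper names (the sign test approximates the bit-pattern compare cgti performs):

#include <stdint.h>
#include <string.h>

/* Bump a float by one ulp by adding 1 to its bit pattern; this is what
 * the add-immediate (ai ..., 1) on the float register does above. */
static float ulp_up(float x) {
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    bits += 1;
    memcpy(&x, &bits, sizeof x);
    return x;
}

float fdiv_nr(float a, float b) {
    float y0 = 1.0f / b;               /* stands in for frest + fi       */
    float q0 = a * y0;                 /* division estimate (fm)         */
    float q1 = (a - q0 * b) * y0 + q0; /* Newton-Raphson step (fnms+fma) */
    float q2 = ulp_up(q1);             /* epsilon addition               */
    return (a - q2 * b >= 0.0f) ? q2 : q1; /* cgti + selb                */
}

The v4f32 fragments below are the identical sequence applied lane-wise.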
// Reciprocal estimate and interpolation
def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>;
// Division estimate
def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>;
// Newton-Raphson iteration
def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment,
(v4f32 VECREG:$rB),
(v4f32 VECREG:$rA)),
Interpv4f32.Fragment,
DivEstv4f32.Fragment)>;
// Epsilon addition
def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>;
def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
(SELBv4f32_cond NRaphv4f32.Fragment,
Epsilonv4f32.Fragment,
(CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB),
Epsilonv4f32.Fragment,
(v4f32 VECREG:$rA)), -1))>;


@ -87,24 +87,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;
// SPUISelLowering.h):
def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>;
// SPU 16-bit multiply
def SPUmpy_vec: SDNode<"SPUISD::MPY", SPUVecBinop, []>;
// SPU multiply unsigned, used in instruction lowering for v4i32
// multiplies:
def SPUmpyu_vec: SDNode<"SPUISD::MPYU", SPUVecBinop, []>;
def SPUmpyu_int: SDNode<"SPUISD::MPYU", SDTIntBinOp, []>;
// SPU 16-bit multiply high x low, shift result 16-bits
// Used to compute intermediate products for 32-bit multiplies
def SPUmpyh_vec: SDNode<"SPUISD::MPYH", SPUVecBinop, []>;
def SPUmpyh_int: SDNode<"SPUISD::MPYH", SDTIntBinOp, []>;
// SPU 16-bit multiply high x high, 32-bit product
// Used to compute intermediate products for 16-bit multiplies
def SPUmpyhh_vec: SDNode<"SPUISD::MPYHH", SPUVecBinop, []>;
def SPUmpyhh_int: SDNode<"SPUISD::MPYHH", SDTIntBinOp, []>;
// Shift left quadword by bits and bytes
def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
@ -117,11 +99,6 @@ def SPUvec_sra: SDNode<"SPUISD::VEC_SRA", SPUvecshift_type, []>;
def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>;
def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>;
def SPUrotquad_rz_bytes: SDNode<"SPUISD::ROTQUAD_RZ_BYTES",
SPUvecshift_type, []>;
def SPUrotquad_rz_bits: SDNode<"SPUISD::ROTQUAD_RZ_BITS",
SPUvecshift_type, []>;
// Vector rotate left, bits shifted out of the left are rotated in on the right
def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT",
SPUvecshift_type, []>;
@ -141,12 +118,6 @@ def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>;
// SPU gather bits instruction:
def SPUgatherbits: SDNode<"SPUISD::GATHER_BITS", SPUgatherbits_type, []>;
// SPU floating point interpolate
def SPUinterpolate : SDNode<"SPUISD::FPInterp", SDTFPBinOp, []>;
// SPU floating point reciprocal estimate (used for fdiv)
def SPUreciprocalEst: SDNode<"SPUISD::FPRecipEst", SDTFPUnaryOp, []>;
def SDTprefslot2vec: SDTypeProfile<1, 1, []>;
def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>;


@ -238,7 +238,7 @@ SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const
SPU::R0, /* link register */
0 /* end */
};
return SPU_CalleeSaveRegs;
}
@ -268,7 +268,7 @@ SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
&SPU::GPRCRegClass, /* link register */
0 /* end */
};
return SPU_CalleeSaveRegClasses;
}
@ -339,10 +339,13 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
// Now add the frame object offset to the offset from r1.
int Offset = MFI->getObjectOffset(FrameIndex);
  // Most instructions, except for generated FrameIndex additions using AIr32
  // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the
  // immediate in operand 2.
  unsigned OpNo = 1;
  if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32)
    OpNo = 2;
MachineOperand &MO = MI.getOperand(OpNo);
// Offset is biased by $lr's slot at the bottom.
@ -355,7 +358,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
if (Offset > SPUFrameInfo::maxFrameOffset()
|| Offset < SPUFrameInfo::minFrameOffset()) {
cerr << "Large stack adjustment ("
         << Offset
<< ") in SPURegisterInfo::eliminateFrameIndex.";
} else {
MO.ChangeToImmediate(Offset);
@ -371,7 +374,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
// Get the number of bytes to allocate from the FrameInfo
unsigned FrameSize = MFI->getStackSize();
// Get the alignments provided by the target, and the maximum alignment
// (if any) of the fixed frame objects.
unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
@ -381,7 +384,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
// Get the maximum call frame size of all the calls.
unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
// If we have dynamic alloca then maxCallFrameSize needs to be aligned so
// that allocations will be aligned.
if (MFI->hasVarSizedObjects())
@ -389,7 +392,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
// Update maximum call frame size.
MFI->setMaxCallFrameSize(maxCallFrameSize);
// Include call frame size in total.
FrameSize += maxCallFrameSize;
@ -418,18 +421,18 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
// Prepare for debug frame info.
bool hasDebugInfo = MMI && MMI->hasDebugInfo();
unsigned FrameLabelId = 0;
// Move MBBI back to the beginning of the function.
MBBI = MBB.begin();
// Work out frame sizes.
determineFrameLayout(MF);
int FrameSize = MFI->getStackSize();
assert((FrameSize & 0xf) == 0
&& "SPURegisterInfo::emitPrologue: FrameSize not aligned");
@ -440,7 +443,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
FrameLabelId = MMI->NextLabelID();
BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(FrameLabelId);
}
// Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp)
// for the ABI
BuildMI(MBB, MBBI, TII.get(SPU::STQDr32), SPU::R0).addImm(16)
@ -476,15 +479,15 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
cerr << "Unhandled frame size: " << FrameSize << "\n";
abort();
}
if (hasDebugInfo) {
std::vector<MachineMove> &Moves = MMI->getFrameMoves();
// Show update of SP.
MachineLocation SPDst(MachineLocation::VirtualFP);
MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize);
Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
@ -495,11 +498,11 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
MachineLocation CSSrc(Reg);
Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
}
// Mark effective beginning of when frame pointer is ready.
unsigned ReadyLabelId = MMI->NextLabelID();
BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(ReadyLabelId);
MachineLocation FPDst(SPU::R1);
MachineLocation FPSrc(MachineLocation::VirtualFP);
Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));


@ -1,9 +1,11 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep frest %t1.s | count 2
; RUN: grep -w fi %t1.s | count 2
; RUN: grep -w fm %t1.s | count 2
; RUN: grep fma %t1.s | count 2
; RUN: grep fnms %t1.s | count 4
; RUN: grep cgti %t1.s | count 2
; RUN: grep selb %t1.s | count 2
;
; This file includes standard floating point arithmetic instructions
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"


@ -1,8 +1,5 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep {fsmbi.*61680} %t1.s | count 1
; RUN: grep rotqmbyi %t1.s | count 1
; RUN: grep rotmai %t1.s | count 1
; RUN: grep selb %t1.s | count 1
; RUN: grep xswd %t1.s | count 1
; RUN: grep shufb %t1.s | count 2
; RUN: grep cg %t1.s | count 1
; RUN: grep addx %t1.s | count 1


@ -8,7 +8,7 @@
; RUN: grep and %t1.s | count 2
; RUN: grep selb %t1.s | count 6
; RUN: grep fsmbi %t1.s | count 4
; RUN: grep shli %t1.s | count 4
; RUN: grep shlhi %t1.s | count 4
; RUN: grep ila %t1.s | count 2
; RUN: grep xsbh %t1.s | count 4


@ -1,10 +1,21 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep -w shlh %t1.s | count 9
; RUN: grep -w shlhi %t1.s | count 3
; RUN: grep -w shl %t1.s | count 9
; RUN: grep -w shli %t1.s | count 3
; RUN: grep -w xshw %t1.s | count 5
; RUN: grep -w and %t1.s | count 5
; RUN: grep -w andi %t1.s | count 2
; RUN: grep -w rotmi %t1.s | count 2
; RUN: grep -w rotqmbyi %t1.s | count 1
; RUN: grep -w rotqmbii %t1.s | count 2
; RUN: grep -w rotqmby %t1.s | count 1
; RUN: grep -w rotqmbi %t1.s | count 1
; RUN: grep -w rotqbyi %t1.s | count 1
; RUN: grep -w rotqbii %t1.s | count 2
; RUN: grep -w rotqbybi %t1.s | count 1
; RUN: grep -w sfi %t1.s | count 3
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"
@ -210,3 +221,57 @@ define i32 @shli_i32_12(i32 zeroext %arg1) zeroext {
%A = shl i32 0, %arg1
ret i32 %A
}
;; i64 shift left
define i64 @shl_i64_1(i64 %arg1) {
%A = shl i64 %arg1, 9
ret i64 %A
}
define i64 @shl_i64_2(i64 %arg1) {
%A = shl i64 %arg1, 3
ret i64 %A
}
define i64 @shl_i64_3(i64 %arg1, i32 %shift) {
%1 = zext i32 %shift to i64
%2 = shl i64 %arg1, %1
ret i64 %2
}
;; i64 shift right logical (zeros fill in from the left)
define i64 @lshr_i64_1(i64 %arg1) {
%1 = lshr i64 %arg1, 9
ret i64 %1
}
define i64 @lshr_i64_2(i64 %arg1) {
%1 = lshr i64 %arg1, 3
ret i64 %1
}
define i64 @lshr_i64_3(i64 %arg1, i32 %shift) {
%1 = zext i32 %shift to i64
%2 = lshr i64 %arg1, %1
ret i64 %2
}
;; i64 shift right arithmetic (sign bit fills in from the left)
define i64 @ashr_i64_1(i64 %arg) {
%1 = ashr i64 %arg, 9
ret i64 %1
}
define i64 @ashr_i64_2(i64 %arg) {
%1 = ashr i64 %arg, 3
ret i64 %1
}
define i64 @ashr_i64_3(i64 %arg1, i32 %shift) {
%1 = zext i32 %shift to i64
%2 = ashr i64 %arg1, %1
ret i64 %2
}


@ -34,19 +34,45 @@ struct pred_s preds[] = {
{ "neq", i64_neq, i64_neq_select }
};
uint64_t i64_shl_const(uint64_t a) {
return a << 10;
}
uint64_t i64_shl(uint64_t a, int amt) {
return a << amt;
}
uint64_t i64_srl_const(uint64_t a) {
return a >> 10;
}
uint64_t i64_srl(uint64_t a, int amt) {
return a >> amt;
}
int64_t i64_sra_const(int64_t a) {
return a >> 10;
}
int64_t i64_sra(int64_t a, int amt) {
return a >> amt;
}
int main(void) {
int i;
int64_t a = 1234567890003LL;
int64_t b = 2345678901235LL;
int64_t c = 1234567890001LL;
int64_t d = 10001LL;
int64_t e = 10000LL;
int64_t f = -1068103409991LL;
printf("a = %16lld (0x%016llx)\n", a, a);
printf("b = %16lld (0x%016llx)\n", b, b);
printf("c = %16lld (0x%016llx)\n", c, c);
printf("d = %16lld (0x%016llx)\n", d, d);
printf("e = %16lld (0x%016llx)\n", e, e);
printf("f = %16lld (0x%016llx)\n", f, f);
printf("----------------------------------------\n");
for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) {
@ -64,5 +90,23 @@ int main(void) {
printf("----------------------------------------\n");
}
printf("a = 0x%016llx\n", a);
printf("i64_shl_const(a) = 0x%016llx\n", i64_shl_const(a));
printf("i64_shl(a) = 0x%016llx\n", i64_shl(a, 5));
printf("i64_srl_const(a) = 0x%016llx\n", i64_srl_const(a));
printf("i64_srl(a) = 0x%016llx\n", i64_srl(a, 5));
printf("i64_sra_const(a) = 0x%016llx\n", i64_sra_const(a));
printf("i64_sra(a) = 0x%016llx\n", i64_sra(a, 5));
printf("----------------------------------------\n");
printf("f = 0x%016llx\n", f);
printf("i64_shl_const(f) = 0x%016llx\n", i64_shl_const(f));
printf("i64_shl(f) = 0x%016llx\n", i64_shl(f, 10));
printf("i64_srl_const(f) = 0x%016llx\n", i64_srl_const(f));
printf("i64_srl(f) = 0x%016llx\n", i64_srl(f, 10));
printf("i64_sra_const(f) = 0x%016llx\n", i64_sra_const(f));
printf("i64_sra(f) = 0x%016llx\n", i64_sra(f, 10));
printf("----------------------------------------\n");
return 0;
}