- Convert remaining i64 custom lowering into custom instruction emission

sequences in SPUDAGToDAGISel.cpp and SPU64InstrInfo.td, killing custom DAG node types as needed. - i64 mul is now a legal instruction, but emits an instruction sequence that stretches tblgen and the imagination, as well as violating laws of several small countries and most southern US states (just kidding, but looking at a function with 80+ parameters is really weird and just plain wrong.) - Update tests as needed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@62254 91177308-0d34-0410-b5e6-96231b3b80d8
2025-05-23 11:38:38 +00:00 · 2009-01-15 04:41:47 +00:00 · 2009-01-15 04:41:47 +00:00 · 94bd57e154
commit 94bd57e154
parent f9b1d79a54
9 changed files with 372 additions and 193 deletions
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@ -2,7 +2,6 @@
 //
 //                     Cell SPU 64-bit operations
 //
-// Primary author: Scott Michel (scottm@aero.org)
 //===----------------------------------------------------------------------===//

 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
@ -240,3 +239,145 @@ def : Pat<(setge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
 // i64 setult:
 def : I64SETCCNegCond<setlt, I64GEr64>;
 def : I64SELECTNegCond<setlt, I64GEr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 add
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_add_cg<dag lhs, dag rhs>:
+    CodeFrag<(CGv4i32 lhs, rhs)>;
+
+class v2i64_add_1<dag lhs, dag rhs, dag cg, dag cg_mask>:
+    CodeFrag<(ADDXv4i32 lhs, rhs, (SHUFBv4i32 cg, cg, cg_mask))>;
+
+class v2i64_add<dag lhs, dag rhs, dag cg_mask>:
+    v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>;
+
+def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+           (ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA),
+                                  (ORv2i64_i64 R64C:$rB),
+                                  (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+           v2i64_add<(v2i64 VECREG:$rA),
+                     (v2i64 VECREG:$rB),
+                     (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 subtraction
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_sub_bg<dag lhs, dag rhs>: CodeFrag<(BGv4i32 lhs, rhs)>;
+
+class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>:
+    CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>;
+
+def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+           (ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA),
+                                  (ORv2i64_i64 R64C:$rB),
+                                  v2i64_sub_bg<(ORv2i64_i64 R64C:$rA),
+                                               (ORv2i64_i64 R64C:$rB)>.Fragment,
+                                  (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+           v2i64_sub<(v2i64 VECREG:$rA),
+                     (v2i64 VECREG:$rB),
+                     v2i64_sub_bg<(v2i64 VECREG:$rA),
+                                  (v2i64 VECREG:$rB)>.Fragment,
+                     (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 multiply
+//
+// Note: i64 multiply is simply the vector->scalar conversion of the
+// full-on v2i64 multiply, since the entire vector has to be manipulated
+// anyway.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_mul_ahi64<dag rA> :
+    CodeFrag<(SELBv4i32 rA, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_bhi64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_alo64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_blo64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_ashlq2<dag rA>:
+    CodeFrag<(SHLQBYIv4i32 rA, 0x2)>;
+
+class v2i64_mul_ashlq4<dag rA>:
+    CodeFrag<(SHLQBYIv4i32 rA, 0x4)>;
+
+class v2i64_mul_bshlq2<dag rB> :
+    CodeFrag<(SHLQBYIv4i32 rB, 0x2)>;
+
+class v2i64_mul_bshlq4<dag rB> :
+    CodeFrag<(SHLQBYIv4i32 rB, 0x4)>;
+
+class v2i64_highprod<dag rA, dag rB>:
+    CodeFrag<(Av4i32
+                (Av4i32
+                  (MPYUv4i32 v2i64_mul_bshlq4<rB>.Fragment,     // a1 x b3
+                             v2i64_mul_ahi64<rA>.Fragment),
+                  (MPYHv4i32 v2i64_mul_ahi64<rA>.Fragment,      // a0 x b3
+                             v2i64_mul_bshlq4<rB>.Fragment)),
+                (Av4i32
+                  (MPYHv4i32 v2i64_mul_bhi64<rB>.Fragment,
+                             v2i64_mul_ashlq4<rA>.Fragment),
+                  (Av4i32
+                    (MPYHv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+                               v2i64_mul_bhi64<rB>.Fragment),
+                    (Av4i32
+                      (MPYUv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+                                 v2i64_mul_bhi64<rB>.Fragment),
+                      (Av4i32
+                        (MPYHv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+                                   v2i64_mul_bshlq2<rB>.Fragment),
+                        (MPYUv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+                                   v2i64_mul_bshlq2<rB>.Fragment))))))>;
+
+class v2i64_mul_a3_b3<dag rA, dag rB>:
+    CodeFrag<(MPYUv4i32 v2i64_mul_alo64<rA>.Fragment,
+                        v2i64_mul_blo64<rB>.Fragment)>;
+
+class v2i64_mul_a2_b3<dag rA, dag rB>:
+    CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+                          (MPYHHUv4i32 v2i64_mul_alo64<rA>.Fragment,
+                                       v2i64_mul_bshlq2<rB>.Fragment), 0x2),
+                        (ILv4i32 0),
+                        (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_mul_a3_b2<dag rA, dag rB>:
+    CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+                          (MPYHHUv4i32 v2i64_mul_blo64<rB>.Fragment,
+                                       v2i64_mul_ashlq2<rA>.Fragment), 0x2),
+                        (ILv4i32 0),
+                        (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_lowsum<dag rA, dag rB, dag rCGmask>:
+    v2i64_add<v2i64_add<v2i64_mul_a3_b3<rA, rB>.Fragment,
+                        v2i64_mul_a2_b3<rA, rB>.Fragment, rCGmask>.Fragment,
+              v2i64_mul_a3_b2<rA, rB>.Fragment, rCGmask>;
+
+class v2i64_mul<dag rA, dag rB, dag rCGmask>:
+    v2i64_add<v2i64_lowsum<rA, rB, rCGmask>.Fragment,
+              (SELBv4i32 v2i64_highprod<rA, rB>.Fragment,
+                         (ILv4i32 0),
+                         (FSMBIv4i32 0x0f0f)),
+              rCGmask>;
+
+def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+          (ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA),
+                                 (ORv2i64_i64 R64C:$rB),
+                                 (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+          v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)>.Fragment>;
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@ -18,11 +18,13 @@
 #include "SPUHazardRecognizers.h"
 #include "SPUFrameInfo.h"
 #include "SPURegisterNames.h"
+#include "SPUTargetMachine.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Constants.h"
@ -254,6 +256,26 @@ public:
    return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
  }

+  SDNode *emitBuildVector(SDValue build_vec) {
+    std::vector<Constant*> CV;
+
+    for (size_t i = 0; i < build_vec.getNumOperands(); ++i) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(build_vec.getOperand(i));
+      CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+    }
+
+    Constant *CP = ConstantVector::get(CV);
+    SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
+    unsigned Alignment = 1 << cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+    SDValue CGPoolOffset =
+            SPU::LowerConstantPool(CPIdx, *CurDAG,
+                                   SPUtli.getSPUTargetMachine());
+    return SelectCode(CurDAG->getLoad(build_vec.getValueType(),
+                      CurDAG->getEntryNode(), CGPoolOffset,
+                      PseudoSourceValue::getConstantPool(), 0,
+                      false, Alignment));
+  }
+
  /// Select - Convert the specified operand from a target-independent to a
  /// target-specific node if it hasn't already been changed.
  SDNode *Select(SDValue Op);
@ -647,22 +669,82 @@ SPUDAGToDAGISel::Select(SDValue Op) {
                                             TFI, Imm0), 0);
      n_ops = 2;
    }
-  } else if (Opc == ISD::ZERO_EXTEND) {
-    // (zero_extend:i16 (and:i8 <arg>, <const>))
-    const SDValue &Op1 = N->getOperand(0);
+  } else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
+             && OpVT == MVT::i64) {
+    SDValue Op0 = Op.getOperand(0);
+    MVT Op0VT = Op0.getValueType();
+    MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
+    MVT OpVecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+    SDValue shufMask;

-    if (Op.getValueType() == MVT::i16 && Op1.getValueType() == MVT::i8) {
-      if (Op1.getOpcode() == ISD::AND) {
-        // Fold this into a single ANDHI. This is often seen in expansions of i1
-        // to i8, then i8 to i16 in logical/branching operations.
-        DEBUG(cerr << "CellSPU: Coalescing (zero_extend:i16 (and:i8 "
-                      "<arg>, <const>))\n");
-        NewOpc = SPU::ANDHIi8i16;
-        Ops[0] = Op1.getOperand(0);
-        Ops[1] = Op1.getOperand(1);
-        n_ops = 2;
-      }
+    switch (Op0VT.getSimpleVT()) {
+    default:
+      cerr << "CellSPU Select: Unhandled zero/any extend MVT\n";
+      abort();
+      /*NOTREACHED*/
+      break;
+    case MVT::i32:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x00010203, MVT::i32),
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x08090a0b, MVT::i32));
+      break;
+
+    case MVT::i16:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x80800203, MVT::i32),
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x80800a0b, MVT::i32));
+      break;
+
+    case MVT::i8:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x80808003, MVT::i32),
+                             CurDAG->getConstant(0x80808080, MVT::i32),
+                             CurDAG->getConstant(0x8080800b, MVT::i32));
+      break;
    }
+
+    SDNode *shufMaskLoad = emitBuildVector(shufMask);
+    SDNode *PromoteScalar =
+            SelectCode(CurDAG->getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0));
+
+    SDValue zextShuffle =
+            CurDAG->getNode(SPUISD::SHUFB, OpVecVT,
+                                       SDValue(PromoteScalar, 0),
+                                       SDValue(PromoteScalar, 0),
+                                       SDValue(shufMaskLoad, 0));
+
+    // N.B.: BIT_CONVERT replaces and updates the zextShuffle node, so we
+    // re-use it in the VEC2PREFSLOT selection without needing to explicitly
+    // call SelectCode (it's already done for us.)
+    SelectCode(CurDAG->getNode(ISD::BIT_CONVERT, OpVecVT, zextShuffle));
+    return SelectCode(CurDAG->getNode(SPUISD::VEC2PREFSLOT, OpVT,
+                                      zextShuffle));
+  } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
+
+    return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
+  } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getBorrowGenerateShufMask(*CurDAG));
+
+    return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
+  } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
+
+    return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
  } else if (Opc == ISD::SHL) {
    if (OpVT == MVT::i64) {
      return SelectSHLi64(Op, OpVT);
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@ -78,6 +78,7 @@ namespace {

    return retval;
  }
+
 }

 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
@ -208,13 +209,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  // Custom lower i8, i32 and i64 multiplications
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
-  setOperationAction(ISD::MUL,  MVT::i64,    Expand);   // libcall
+  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
-  setOperationAction(ISD::ADD,  MVT::i64,    Custom);
+  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
-  setOperationAction(ISD::SUB,  MVT::i64,    Custom);
+  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
@ -243,11 +244,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);

-  // Zero extension and sign extension for i64 have to be
-  // custom legalized
-  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::ANY_EXTEND,  MVT::i64, Custom);
-
  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

@ -416,10 +412,9 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
    node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
    node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
-    node_names[(unsigned) SPUISD::ADD_EXTENDED] = "SPUISD::ADD_EXTENDED";
-    node_names[(unsigned) SPUISD::CARRY_GENERATE] = "SPUISD::CARRY_GENERATE";
-    node_names[(unsigned) SPUISD::SUB_EXTENDED] = "SPUISD::SUB_EXTENDED";
-    node_names[(unsigned) SPUISD::BORROW_GENERATE] = "SPUISD::BORROW_GENERATE";
+    node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
+    node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
+    node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
  }

  std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
@ -778,8 +773,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  return SDValue();
 }

-/// Generate the address of a constant pool entry.
-static SDValue
+//! Generate the address of a constant pool entry.
+SDValue
 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@ -805,6 +800,12 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  return SDValue();
 }

+//! Alternate entry point for generating the address of a constant pool entry
+SDValue
+SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
+  return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
+}
+
 static SDValue
 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT PtrVT = Op.getValueType();
@ -2185,123 +2186,34 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
  return SDValue();
 }

-static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
-{
-  MVT VT = Op.getValueType();
-  MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+//! Generate the carry-generate shuffle mask.
+SDValue SPU::getCarryGenerateShufMask(SelectionDAG &DAG) {
+SmallVector<SDValue, 16> ShufBytes;

-  SDValue Op0 = Op.getOperand(0);
+// Create the shuffle mask for "rotating" the borrow up one register slot
+// once the borrow is generated.
+ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));

-  switch (Opc) {
-  case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND: {
-    MVT Op0VT = Op0.getValueType();
-    MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
+return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                   &ShufBytes[0], ShufBytes.size());
+}

-    SDValue PromoteScalar =
-            DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0);
+//! Generate the borrow-generate shuffle mask
+SDValue SPU::getBorrowGenerateShufMask(SelectionDAG &DAG) {
+SmallVector<SDValue, 16> ShufBytes;

-    // Use a shuffle to zero extend the i32 to i64 directly:
-    SDValue shufMask;
+// Create the shuffle mask for "rotating" the borrow up one register slot
+// once the borrow is generated.
+ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));

-    switch (Op0VT.getSimpleVT()) {
-    default:
-      cerr << "CellSPU LowerI64Math: Unhandled zero/any extend MVT\n";
-      abort();
-      /*NOTREACHED*/
-      break;
-    case MVT::i32:
-      shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x00010203, MVT::i32),
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x08090a0b, MVT::i32));
-        break;
-
-    case MVT::i16:
-      shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x80800203, MVT::i32),
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x80800a0b, MVT::i32));
-      break;
-
-    case MVT::i8:
-      shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x80808003, MVT::i32),
-                             DAG.getConstant(0x80808080, MVT::i32),
-                             DAG.getConstant(0x8080800b, MVT::i32));
-      break;
-    }
-
-    SDValue zextShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT,
-                                      PromoteScalar, PromoteScalar, shufMask);
-
-    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
-                       DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle));
-  }
-
-  case ISD::ADD: {
-    // Turn operands into vectors to satisfy type checking (shufb works on
-    // vectors)
-    SDValue Op0 =
-      DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0));
-    SDValue Op1 =
-      DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(1));
-    SmallVector<SDValue, 16> ShufBytes;
-
-    // Create the shuffle mask for "rotating" the borrow up one register slot
-    // once the borrow is generated.
-    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-
-    SDValue CarryGen =
-      DAG.getNode(SPUISD::CARRY_GENERATE, MVT::v2i64, Op0, Op1);
-    SDValue ShiftedCarry =
-      DAG.getNode(SPUISD::SHUFB, MVT::v2i64,
-                  CarryGen, CarryGen,
-                  DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                              &ShufBytes[0], ShufBytes.size()));
-
-    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
-                       DAG.getNode(SPUISD::ADD_EXTENDED, MVT::v2i64,
-                                   Op0, Op1, ShiftedCarry));
-  }
-
-  case ISD::SUB: {
-    // Turn operands into vectors to satisfy type checking (shufb works on
-    // vectors)
-    SDValue Op0 =
-      DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0));
-    SDValue Op1 =
-      DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(1));
-    SmallVector<SDValue, 16> ShufBytes;
-
-    // Create the shuffle mask for "rotating" the borrow up one register slot
-    // once the borrow is generated.
-    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-
-    SDValue BorrowGen =
-      DAG.getNode(SPUISD::BORROW_GENERATE, MVT::v2i64, Op0, Op1);
-    SDValue ShiftedBorrow =
-      DAG.getNode(SPUISD::SHUFB, MVT::v2i64,
-                  BorrowGen, BorrowGen,
-                  DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                              &ShufBytes[0], ShufBytes.size()));
-
-    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
-                       DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64,
-                                   Op0, Op1, ShiftedBorrow));
-  }
-  }
-
-  return SDValue();
+return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                   &ShufBytes[0], ShufBytes.size());
 }

 //! Lower byte immediate operations for v16i8 vectors:
@ -2576,11 +2488,6 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  case ISD::RET:
    return LowerRET(Op, DAG, getTargetMachine());

-
-  case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND:
-    return LowerI64Math(Op, DAG, Opc);
-
  // i8, i64 math ops:
  case ISD::ADD:
  case ISD::SUB:
@ -2591,8 +2498,6 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  case ISD::SRA: {
    if (VT == MVT::i8)
      return LowerI8Math(Op, DAG, Opc, *this);
-    else if (VT == MVT::i64)
-      return LowerI64Math(Op, DAG, Opc);
    break;
  }

@ -2831,6 +2736,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    break;
  }
  }
+  
  // Otherwise, return unchanged.
 #ifndef NDEBUG
  if (Result.getNode()) {
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@ -52,10 +52,11 @@ namespace llvm {
      ROTBYTES_LEFT_BITS,       ///< Rotate bytes left by bit shift count
      SELECT_MASK,              ///< Select Mask (FSM, FSMB, FSMH, FSMBI)
      SELB,                     ///< Select bits -> (b & mask) | (a & ~mask)
-      ADD_EXTENDED,             ///< Add extended, with carry
-      CARRY_GENERATE,           ///< Carry generate for ADD_EXTENDED
-      SUB_EXTENDED,             ///< Subtract extended, with borrow
-      BORROW_GENERATE,          ///< Borrow generate for SUB_EXTENDED
+      // Markers: These aren't used to generate target-dependent nodes, but
+      // are used during instruction selection.
+      ADD64_MARKER,             ///< i64 addition marker
+      SUB64_MARKER,             ///< i64 subtraction marker
+      MUL64_MARKER,             ///< i64 multiply marker
      LAST_SPUISD               ///< Last user-defined instruction
    };
  }
@ -74,6 +75,12 @@ namespace llvm {
                              MVT ValueType);
    SDValue get_v4i32_imm(SDNode *N, SelectionDAG &DAG);
    SDValue get_v2i64_imm(SDNode *N, SelectionDAG &DAG);
+
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
+                              const SPUTargetMachine &TM);
+
+    SDValue getBorrowGenerateShufMask(SelectionDAG &DAG);
+    SDValue getCarryGenerateShufMask(SelectionDAG &DAG);
  }

  class SPUTargetMachine;            // forward dec'l.
@ -86,8 +93,18 @@ namespace llvm {
    SPUTargetMachine &SPUTM;

  public:
+    //! The venerable constructor
+    /*!
+     This is where the CellSPU backend sets operation handling (i.e., legal,
+     custom, expand or promote.)
+     */
    SPUTargetLowering(SPUTargetMachine &TM);

+    //! Get the target machine
+    SPUTargetMachine &getSPUTargetMachine() {
+      return SPUTM;
+    }
+
    /// getTargetNodeName() - This method returns the name of a target specific
    /// DAG node.
    virtual const char *getTargetNodeName(unsigned Opcode) const;
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@ -705,17 +705,14 @@ class ADDXInst<dag OOL, dag IOL, list<dag> pattern>:
 class ADDXVecInst<ValueType vectype>:
    ADDXInst<(outs VECREG:$rT),
             (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
-             [(set (vectype VECREG:$rT),
-                   (SPUaddx (vectype VECREG:$rA), (vectype VECREG:$rB),
-                            (vectype VECREG:$rCarry)))]>,
+             [/* no pattern */]>,
    RegConstraint<"$rCarry = $rT">,
    NoEncode<"$rCarry">;

 class ADDXRegInst<RegisterClass rclass>:
    ADDXInst<(outs rclass:$rT),
             (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
-             [(set rclass:$rT,
-                   (SPUaddx rclass:$rA, rclass:$rB, rclass:$rCarry))]>,
+             [/* no pattern */]>,
    RegConstraint<"$rCarry = $rT">,
    NoEncode<"$rCarry">;

@ -737,14 +734,12 @@ class CGInst<dag OOL, dag IOL, list<dag> pattern>:
 class CGVecInst<ValueType vectype>:
    CGInst<(outs VECREG:$rT),
           (ins VECREG:$rA, VECREG:$rB),
-             [(set (vectype VECREG:$rT),
-                   (SPUcarry_gen (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+           [/* no pattern */]>;

 class CGRegInst<RegisterClass rclass>:
    CGInst<(outs rclass:$rT),
           (ins rclass:$rA, rclass:$rB),
-             [(set rclass:$rT,
-                   (SPUcarry_gen rclass:$rA, rclass:$rB))]>;
+           [/* no pattern */]>;

 multiclass CarryGenerate {
  def v2i64 : CGVecInst<v2i64>;
@ -765,17 +760,14 @@ class SFXInst<dag OOL, dag IOL, list<dag> pattern>:
 class SFXVecInst<ValueType vectype>:
    SFXInst<(outs VECREG:$rT),
            (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
-             [(set (vectype VECREG:$rT),
-                   (SPUsubx (vectype VECREG:$rA), (vectype VECREG:$rB),
-                            (vectype VECREG:$rCarry)))]>,
+             [/* no pattern */]>,
    RegConstraint<"$rCarry = $rT">,
    NoEncode<"$rCarry">;

 class SFXRegInst<RegisterClass rclass>:
    SFXInst<(outs rclass:$rT),
            (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
-             [(set rclass:$rT,
-                   (SPUsubx rclass:$rA, rclass:$rB, rclass:$rCarry))]>,
+             [/* no pattern */]>,
    RegConstraint<"$rCarry = $rT">,
    NoEncode<"$rCarry">;

@ -797,14 +789,12 @@ class BGInst<dag OOL, dag IOL, list<dag> pattern>:
 class BGVecInst<ValueType vectype>:
    BGInst<(outs VECREG:$rT),
           (ins VECREG:$rA, VECREG:$rB),
-           [(set (vectype VECREG:$rT),
-                 (SPUborrow_gen (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+           [/* no pattern */]>;

 class BGRegInst<RegisterClass rclass>:
    BGInst<(outs rclass:$rT),
           (ins rclass:$rA, rclass:$rB),
-           [(set rclass:$rT,
-                 (SPUborrow_gen rclass:$rA, rclass:$rB))]>;
+           [/* no pattern */]>;

 multiclass BorrowGenerate {
  def v4i32 : BGVecInst<v4i32>;
@ -894,7 +884,7 @@ class MPYAInst<dag OOL, dag IOL, list<dag> pattern>:
          "mpya\t$rT, $rA, $rB, $rC", IntegerMulDiv,
          pattern>;
          
-def MPYAvec:
+def MPYAv4i32:
  MPYAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
           [(set (v4i32 VECREG:$rT),
                 (add (v4i32 (bitconvert (mul (v8i16 VECREG:$rA),
@ -939,7 +929,7 @@ class MPYSInst<dag OOL, dag IOL>:
      "mpys\t$rT, $rA, $rB", IntegerMulDiv,
      [/* no pattern */]>;

-def MPYSvec:
+def MPYSv4i32:
    MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
    
 def MPYSr16:
@ -972,14 +962,20 @@ def MPYHHAvec:
 def MPYHHAr32:
    MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;

-// mpyhhu: Multiply high-high, unsigned
+// mpyhhu: Multiply high-high, unsigned, e.g.:
+//
+// +-------+-------+   +-------+-------+   +---------+
+// |  a0   .  a1   | x |  b0   .  b1   | = | a0 x b0 |
+// +-------+-------+   +-------+-------+   +---------+
+//
+// where a0, b0 are the upper 16 bits of the 32-bit word

 class MPYHHUInst<dag OOL, dag IOL>:
    RRForm<0b01110011110, OOL, IOL,
      "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
      [/* no pattern */]>;

-def MPYHHUvec:
+def MPYHHUv4i32:
    MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
    
 def MPYHHUr32:
--- a/lib/Target/CellSPU/SPUMathInstr.td
+++ b/lib/Target/CellSPU/SPUMathInstr.td
@ -8,8 +8,6 @@
 //
 // Any resemblance to libsimdmath or the Cell SDK simdmath library is
 // purely and completely coincidental.
-//
-// Primary author: Scott Michel (scottm@aero.org)
 //===----------------------------------------------------------------------===//

 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
--- a/lib/Target/CellSPU/SPUNodes.td
+++ b/lib/Target/CellSPU/SPUNodes.td
@ -61,18 +61,20 @@ def SPUselb_type: SDTypeProfile<1, 3, [
 def SPUvecshift_type: SDTypeProfile<1, 2, [
  SDTCisSameAs<0, 1>, SDTCisInt<2>]>;

+// "marker" type for i64 operators that need a shuffle mask
+// (i.e., uses cg or bg or another instruction that needs to
+// use shufb to get things in the right place.)
+// Op0: The result
+// Op1, 2: LHS, RHS
+// Op3: Carry-generate shuffle mask
+
+def SPUmarker_type : SDTypeProfile<1, 3, [
+  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>;
+
 //===----------------------------------------------------------------------===//
 // Synthetic/pseudo-instructions
 //===----------------------------------------------------------------------===//

-/// Add extended, carry generate:
-def SPUaddx : SDNode<"SPUISD::ADD_EXTENDED", SPUIntTrinaryOp, []>;
-def SPUcarry_gen : SDNode<"SPUISD::CARRY_GENERATE", SDTIntBinOp, []>;
-
-// Subtract extended, borrow generate
-def SPUsubx : SDNode<"SPUISD::SUB_EXTENDED", SPUIntTrinaryOp, []>;
-def SPUborrow_gen : SDNode<"SPUISD::BORROW_GENERATE", SDTIntBinOp, []>;
-
 // SPU CNTB:
 def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;

@ -127,6 +129,12 @@ def SPUaform : SDNode<"SPUISD::AFormAddr", SDTIntBinOp, []>;
 // Indirect [D-Form "imm($reg)" and X-Form "$reg($reg)"] addresses
 def SPUindirect : SDNode<"SPUISD::IndirectAddr", SDTIntBinOp, []>;

+// i64 markers: supplies extra operands used to generate the i64 operator
+// instruction sequences
+def SPUadd64 : SDNode<"SPUISD::ADD64_MARKER", SPUmarker_type, []>;
+def SPUsub64 : SDNode<"SPUISD::SUB64_MARKER", SPUmarker_type, []>;
+def SPUmul64 : SDNode<"SPUISD::MUL64_MARKER", SPUmarker_type, []>;
+
 //===----------------------------------------------------------------------===//
 // Constraints: (taken from PPCInstrInfo.td)
 //===----------------------------------------------------------------------===//
--- a/test/CodeGen/CellSPU/i64ops.ll
+++ b/test/CodeGen/CellSPU/i64ops.ll
@ -2,9 +2,15 @@
 ; RUN: grep xswd	     %t1.s | count 3
 ; RUN: grep xsbh	     %t1.s | count 1
 ; RUN: grep xshw	     %t1.s | count 2
-; RUN: grep shufb            %t1.s | count 4
-; RUN: grep cg               %t1.s | count 1
-; RUN: grep addx             %t1.s | count 1
+; RUN: grep shufb        %t1.s | count 7
+; RUN: grep cg           %t1.s | count 4
+; RUN: grep addx         %t1.s | count 4
+; RUN: grep fsmbi        %t1.s | count 3
+; RUN: grep il           %t1.s | count 2
+; RUN: grep mpy          %t1.s | count 10
+; RUN: grep mpyh         %t1.s | count 6
+; RUN: grep mpyhhu       %t1.s | count 2
+; RUN: grep mpyu         %t1.s | count 4

 ; ModuleID = 'stores.bc'
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
@ -44,3 +50,8 @@ define i64 @add_i64(i64 %a, i64 %b) nounwind {
  %1 = add i64 %a, %b
  ret i64 %1
 }
+
+define i64 @mul_i64(i64 %a, i64 %b) nounwind {
+  %1 = mul i64 %a, %b
+  ret i64 %1
+}
--- a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
+++ b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
@ -7,6 +7,7 @@ int64_t         tval_c = 1234567890001LL;
 int64_t         tval_d = 10001LL;
 int64_t         tval_e = 10000LL;
 uint64_t        tval_f = 0xffffff0750135eb9;
+int64_t		tval_g = -1;

 /* ~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~- */

@ -546,6 +547,12 @@ test_i64_variable_shift(const char *func_name, int64_t (*func)(int64_t, int), in

 /* ~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~- */

+int64_t i64_mul(int64_t a, int64_t b) {
+  return a * b;
+}
+
+/* ~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~- */
+
 int
 main(void)
 {
@ -553,12 +560,13 @@ main(void)
  const char     *something_failed = "  %d tests failed.\n";
  const char     *all_tests_passed = "  All tests passed.\n";

-  printf("tval_a = %20lld (0x%020llx)\n", tval_a, tval_a);
-  printf("tval_b = %20lld (0x%020llx)\n", tval_b, tval_b);
-  printf("tval_c = %20lld (0x%020llx)\n", tval_c, tval_c);
-  printf("tval_d = %20lld (0x%020llx)\n", tval_d, tval_d);
-  printf("tval_e = %20lld (0x%020llx)\n", tval_e, tval_e);
-  printf("tval_f = %20llu (0x%020llx)\n", tval_f, tval_f);
+  printf("tval_a = %20lld (0x%016llx)\n", tval_a, tval_a);
+  printf("tval_b = %20lld (0x%016llx)\n", tval_b, tval_b);
+  printf("tval_c = %20lld (0x%016llx)\n", tval_c, tval_c);
+  printf("tval_d = %20lld (0x%016llx)\n", tval_d, tval_d);
+  printf("tval_e = %20lld (0x%016llx)\n", tval_e, tval_e);
+  printf("tval_f = %20llu (0x%016llx)\n", tval_f, tval_f);
+  printf("tval_g = %20llu (0x%016llx)\n", tval_g, tval_g);
  printf("----------------------------------------\n");

  for (i = 0; i < ARR_SIZE(int64_preds); ++i) {
@ -649,5 +657,17 @@ main(void)

  printf("----------------------------------------\n");

+  int64_t result;
+  
+  result = i64_mul(tval_g, tval_g);
+  printf("%20lld * %20lld = %20lld (0x%016llx)\n", tval_g, tval_g, result, result);
+  result = i64_mul(tval_d, tval_e);
+  printf("%20lld * %20lld = %20lld (0x%016llx)\n", tval_d, tval_e, result, result);
+  /* 0xba7a664f13077c9 */
+  result = i64_mul(tval_a, tval_b);
+  printf("%20lld * %20lld = %20lld (0x%016llx)\n", tval_a, tval_b, result, result);
+
+  printf("----------------------------------------\n");
+
  return 0;
 }