diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
new file mode 100644
index 00000000000..6fdb14cf3b8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -0,0 +1,654 @@
+//===-- SPUAsmPrinter.cpp - Print machine instrs to Cell SPU assembly ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Cell SPU assembly language. This printer
+// is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+  STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+  const std::string bss_section(".bss");
+
+  struct VISIBILITY_HIDDEN SPUAsmPrinter : public AsmPrinter {
+    std::set<std::string> FnStubs, GVStubs;
+
+    SPUAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T) :
+      AsmPrinter(O, TM, T)
+    {
+    }
+
+    virtual const char *getPassName() const {
+      return "STI CBEA SPU Assembly Printer";
+    }
+
+    SPUTargetMachine &getTM() {
+      return static_cast<SPUTargetMachine&>(TM);
+    }
+
+    /// printInstruction - This method is automatically generated by tablegen
+    /// from the instruction set description.  This method returns true if
+    /// the machine instruction was sufficiently described to print it,
+    /// otherwise it returns false.
+    bool printInstruction(const MachineInstr *MI);
+
+    void printMachineInstruction(const MachineInstr *MI);
+    void printOp(const MachineOperand &MO);
+
+    /// printRegister - Print register according to target requirements.
+    ///
+    void printRegister(const MachineOperand &MO, bool R0AsZero) {
+      unsigned RegNo = MO.getReg();
+      assert(MRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");
+      O << TM.getRegisterInfo()->get(RegNo).Name;
+    }
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo) {
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      if (MO.isRegister()) {
+        assert(MRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physreg??");
+        O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+      } else if (MO.isImmediate()) {
+        O << MO.getImmedValue();
+      } else {
+        printOp(MO);
+      }
+    }
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode);
+
+
+    void
+    printS7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      int value = MI->getOperand(OpNo).getImmedValue();
+      value = (value << (32 - 7)) >> (32 - 7);
+
+      assert((value >= -(1 << 6) && value <= (1 << 6) - 1)
+             && "Invalid s7 argument");
+      O << value;
+    }
+
+    void
+    printU7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      unsigned int value = MI->getOperand(OpNo).getImmedValue();
+      assert(value < (1 << 7) && "Invalid u7 argument");
+      O << value;
+    }
+
+    void
+    printMemRegImmS7(const MachineInstr *MI, unsigned OpNo)
+    {
+      char value = MI->getOperand(OpNo).getImmedValue();
+      O << (int) value;
+      O << "(";
+      printOperand(MI, OpNo+1);
+      O << ")";
+    }
+
+    void
+    printS16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (short) MI->getOperand(OpNo).getImmedValue();
+    }
+
+    void
+    printU16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (unsigned short)MI->getOperand(OpNo).getImmedValue();
+    }
+
+    void
+    printU32ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (unsigned)MI->getOperand(OpNo).getImmedValue();
+    }
+
+    void
+    printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
+      // [$reg, $reg] (X-form) operand: print the base register, then the
+      // index register.
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+      O << ", ";
+      printOperand(MI, OpNo+1);
+    }
+
+    void
+    printU18ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      unsigned int value = MI->getOperand(OpNo).getImmedValue();
+      assert(value <= (1 << 18) - 1 && "Invalid u18 argument");
+      O << value;
+    }
+
+    void
+    printS10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      short value = (short) (((int) MI->getOperand(OpNo).getImmedValue() << 16)
+                             >> 16);
+      assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
+             && "Invalid s10 argument");
+      O << value;
+    }
+
+    void
+    printU10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      short value = (short) (((int) MI->getOperand(OpNo).getImmedValue() << 16)
+                             >> 16);
+      assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
+      O << value;
+    }
+
+    void
+    printMemRegImmS10(const MachineInstr *MI, unsigned OpNo)
+    {
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      assert(MO.isImmediate()
+             && "printMemRegImmS10 first operand is not immediate");
+      printS10ImmOperand(MI, OpNo);
+      O << "(";
+      printOperand(MI, OpNo+1);
+      O << ")";
+    }
+
+    void
+    printAddr256K(const MachineInstr *MI, unsigned OpNo)
+    {
+      /* Note: operand 1 is an offset or symbol name. Operand 2 is
+         ignored.
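+
+         For example, an immediate operand prints as a bare signed 16-bit
+         value, while a symbolic operand (global or constant-pool entry)
+         prints through printOp().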
*/ + if (MI->getOperand(OpNo).isImmediate()) { + printS16ImmOperand(MI, OpNo); + } else { + printOp(MI->getOperand(OpNo)); + } + } + + void printCallOperand(const MachineInstr *MI, unsigned OpNo) { + printOp(MI->getOperand(OpNo)); + } + + void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo) { + printOp(MI->getOperand(OpNo)); + O << "-."; + } + + void printSymbolHi(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + printS16ImmOperand(MI, OpNo); + } else { + printOp(MI->getOperand(OpNo)); + O << "@h"; + } + } + + void printSymbolLo(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + printS16ImmOperand(MI, OpNo); + } else { + printOp(MI->getOperand(OpNo)); + O << "@l"; + } + } + + /// Print local store address + void printSymbolLSA(const MachineInstr *MI, unsigned OpNo) { + printOp(MI->getOperand(OpNo)); + } + + void printROTHNeg7Imm(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + int value = (int) MI->getOperand(OpNo).getImmedValue(); + assert((value >= 0 && value < 16) + && "Invalid negated immediate rotate 7-bit argument"); + O << -value; + } else { + assert(0 && "Invalid/non-immediate rotate amount in printRotateNeg7Imm"); + } + } + + void printROTNeg7Imm(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + int value = (int) MI->getOperand(OpNo).getImmedValue(); + assert((value >= 0 && value < 32) + && "Invalid negated immediate rotate 7-bit argument"); + O << -value; + } else { + assert(0 && "Invalid/non-immediate rotate amount in printRotateNeg7Imm"); + } + } + + virtual bool runOnMachineFunction(MachineFunction &F) = 0; + virtual bool doFinalization(Module &M) = 0; + }; + + /// LinuxAsmPrinter - SPU assembly printer, customized for Linux + struct VISIBILITY_HIDDEN LinuxAsmPrinter : public SPUAsmPrinter { + + DwarfWriter DW; + + LinuxAsmPrinter(std::ostream &O, SPUTargetMachine &TM, + const TargetAsmInfo *T) : + SPUAsmPrinter(O, TM, T), + DW(O, this, T) + { } + + virtual const char *getPassName() const { + return "STI CBEA SPU Assembly Printer"; + } + + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + SPUAsmPrinter::getAnalysisUsage(AU); + } + + /// getSectionForFunction - Return the section that we should emit the + /// specified function body into. + virtual std::string getSectionForFunction(const Function &F) const; + }; +} // end of anonymous namespace + +// Include the auto-generated portion of the assembly writer +#include "SPUGenAsmWriter.inc" + +void SPUAsmPrinter::printOp(const MachineOperand &MO) { + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + cerr << "printOp() does not handle immediate values\n"; + abort(); + return; + + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_JumpTableIndex: + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getJumpTableIndex(); + // FIXME: PIC relocation model + return; + case MachineOperand::MO_ConstantPoolIndex: + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << MO.getConstantPoolIndex(); + return; + case MachineOperand::MO_ExternalSymbol: + // Computing the address of an external symbol, not calling it. 
+ if (TM.getRelocationModel() != Reloc::Static) { + std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName(); + GVStubs.insert(Name); + O << "L" << Name << "$non_lazy_ptr"; + return; + } + O << TAI->getGlobalPrefix() << MO.getSymbolName(); + return; + case MachineOperand::MO_GlobalAddress: { + // Computing the address of a global symbol, not calling it. + GlobalValue *GV = MO.getGlobal(); + std::string Name = Mang->getValueName(GV); + + // External or weakly linked global variables need non-lazily-resolved + // stubs + if (TM.getRelocationModel() != Reloc::Static) { + if (((GV->isDeclaration() || GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()))) { + GVStubs.insert(Name); + O << "L" << Name << "$non_lazy_ptr"; + return; + } + } + O << Name; + + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + return; + } + + default: + O << ""; + return; + } +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isRegister() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isRegister()) + return true; + ++OpNo; // Return the high-part. + break; + } + } + + printOperand(MI, OpNo); + return false; +} + +bool SPUAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + printMemRegReg(MI, OpNo); + return false; +} + +/// printMachineInstruction -- Print out a single PowerPC MI in Darwin syntax +/// to the current output stream. +/// +void SPUAsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + printInstruction(MI); +} + + + +std::string LinuxAsmPrinter::getSectionForFunction(const Function &F) const { + switch (F.getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::ExternalLinkage: + case Function::InternalLinkage: return TAI->getTextSection(); + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + return ""; // Print nothing for the time being... + } +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool +LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) +{ + DW.SetModuleInfo(&getAnalysis()); + + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out labels for the function. + const Function *F = MF.getFunction(); + + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + EmitAlignment(3, F); + + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: // Symbols default to internal. 
+ break; + case Function::ExternalLinkage: + O << "\t.global\t" << CurrentFnName << "\n" + << "\t.type\t" << CurrentFnName << ", @function\n"; + break; + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + O << "\t.global\t" << CurrentFnName << "\n"; + O << "\t.weak_definition\t" << CurrentFnName << "\n"; + break; + } + O << CurrentFnName << ":\n"; + + // Emit pre-function debug information. + DW.BeginFunction(&MF); + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. + if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + O << "\t.size\t" << CurrentFnName << ",.-" << CurrentFnName << "\n"; + + // Print out jump tables referenced by the function. + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + // Emit post-function debug information. + DW.EndFunction(); + + // We didn't modify anything. + return false; +} + + +bool LinuxAsmPrinter::doInitialization(Module &M) { + bool Result = AsmPrinter::doInitialization(M); + SwitchToTextSection(TAI->getTextSection()); + // Emit initial debug information. + DW.BeginModule(&M); + return Result; +} + +bool LinuxAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasInitializer()) continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeStoreSize(C->getType()); + unsigned Align = TD->getPreferredAlignmentLog(I); + + if (C->isNullValue() && /* FIXME: Verify correct */ + (I->hasInternalLinkage() || I->hasWeakLinkage() || + I->hasLinkOnceLinkage() || + (I->hasExternalLinkage() && !I->hasSection()))) { + if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. + if (I->hasExternalLinkage()) { + // External linkage globals -> .bss section + // FIXME: Want to set the global variable's section so that + // SwitchToDataSection emits the ".section" directive + SwitchToDataSection("\t.section\t.bss", I); + O << "\t.global\t" << name << '\n'; + O << "\t.align\t" << Align << '\n'; + O << "\t.type\t" << name << ", @object\n"; + O << "\t.size\t" << name << ", " << Size << '\n'; + O << name << ":\n"; + O << "\t.zero\t" << Size; + } else if (I->hasInternalLinkage()) { + SwitchToDataSection("\t.data", I); + O << TAI->getLCOMMDirective() << name << "," << Size << "," << Align; + } else { + SwitchToDataSection("\t.data", I); + O << ".comm " << name << "," << Size; + } + O << "\t\t# '" << I->getName() << "'\n"; + } else { + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + O << "\t.global " << name << '\n' + << "\t.weak_definition " << name << '\n'; + SwitchToDataSection(".section __DATA,__datacoal_nt,coalesced", I); + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. 
+ case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.global " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + if (I->isConstant()) { + const ConstantArray *CVA = dyn_cast(C); + if (TAI->getCStringSection() && CVA && CVA->isCString()) { + SwitchToDataSection(TAI->getCStringSection(), I); + break; + } + } + + SwitchToDataSection("\t.data", I); + break; + default: + cerr << "Unknown linkage type!"; + abort(); + } + + EmitAlignment(Align, I); + O << name << ":\t\t\t\t# '" << I->getName() << "'\n"; + + // If the initializer is a extern weak symbol, remember to emit the weak + // reference! + if (const GlobalValue *GV = dyn_cast(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + O << '\n'; + } + } + + // Output stubs for dynamically-linked functions + if (TM.getRelocationModel() == Reloc::PIC_) { + for (std::set::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i) { + SwitchToTextSection(".section __TEXT,__picsymbolstub1,symbol_stubs," + "pure_instructions,32"); + EmitAlignment(4); + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\tmflr r0\n"; + O << "\tbcl 20,31,L0$" << *i << "\n"; + O << "L0$" << *i << ":\n"; + O << "\tmflr r11\n"; + O << "\taddis r11,r11,ha16(L" << *i << "$lazy_ptr-L0$" << *i << ")\n"; + O << "\tmtlr r0\n"; + O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr-L0$" << *i << ")(r11)\n"; + O << "\tmtctr r12\n"; + O << "\tbctr\n"; + SwitchToDataSection(".lazy_symbol_pointer"); + O << "L" << *i << "$lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\t.long dyld_stub_binding_helper\n"; + } + } else { + for (std::set::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i) { + SwitchToTextSection(".section __TEXT,__symbol_stub1,symbol_stubs," + "pure_instructions,16"); + EmitAlignment(4); + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\tlis r11,ha16(L" << *i << "$lazy_ptr)\n"; + O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr)(r11)\n"; + O << "\tmtctr r12\n"; + O << "\tbctr\n"; + SwitchToDataSection(".lazy_symbol_pointer"); + O << "L" << *i << "$lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\t.long dyld_stub_binding_helper\n"; + } + } + + O << "\n"; + + // Output stubs for external and common global variables. + if (GVStubs.begin() != GVStubs.end()) { + SwitchToDataSection(".non_lazy_symbol_pointer"); + for (std::set::iterator I = GVStubs.begin(), + E = GVStubs.end(); I != E; ++I) { + O << "L" << *I << "$non_lazy_ptr:\n"; + O << "\t.indirect_symbol " << *I << "\n"; + O << "\t.long\t0\n"; + } + } + + // Emit initial debug information. + DW.EndModule(); + + // Emit ident information + O << "\t.ident\t\"(llvm 1.9+) STI CBEA Cell SPU backend\"\n"; + + return AsmPrinter::doFinalization(M); +} + + + +/// createSPUCodePrinterPass - Returns a pass that prints the Cell SPU +/// assembly code for a MachineFunction to the given output stream, in a format +/// that the Linux SPU assembler can deal with. 
+///
+FunctionPass *llvm::createSPUAsmPrinterPass(std::ostream &o,
+                                            SPUTargetMachine &tm) {
+  return new LinuxAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td
new file mode 100644
index 00000000000..cc1a9d6fd90
--- /dev/null
+++ b/lib/Target/CellSPU/SPUCallingConv.td
@@ -0,0 +1,62 @@
+//===- SPUCallingConv.td - Calling Conventions for CellSPU -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the STI Cell SPU architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<SPUSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for Cell SPU: Everything can be passed back via $3:
+def RetCC_SPU : CallingConv<[
+  CCIfType<[i32], CCAssignToReg<[R3]>>,
+  CCIfType<[i64], CCAssignToReg<[R3]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[R3]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[R3]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// CellSPU Argument Calling Conventions
+// FIXME
+//===----------------------------------------------------------------------===//
+/*
+def CC_SPU : CallingConv<[
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+
+  // SPU can pass back arguments in all
+  CCIfType<[f32, f64], CCIfSubtarget<"isMachoABI()",
+           CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11,
+                          F12, F13]>>>,
+  // Other sub-targets pass FP values in F1-10.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8, F9,
+                                      F10]>>,
+
+  // The first 12 vector arguments are passed in altivec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12,
+                          V13]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToStack<16, 16>>
+]>;
+*/
diff --git a/lib/Target/CellSPU/SPUFrameInfo.cpp b/lib/Target/CellSPU/SPUFrameInfo.cpp
new file mode 100644
index 00000000000..c110db9abc9
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.cpp
@@ -0,0 +1,32 @@
+//===-- SPUFrameInfo.cpp - Frame information for the Cell SPU -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Frame information (stack layout and spill slots) for the Cell SPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUFrameInfo.h"
+#include "SPURegisterNames.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// SPUFrameInfo:
+//===----------------------------------------------------------------------===//
+
+SPUFrameInfo::SPUFrameInfo(const TargetMachine &tm):
+  TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+  TM(tm)
+{
+  LR[0].first = SPU::R0;
+  LR[0].second = 16;
+}
diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameInfo.h
new file mode 100644
index 00000000000..2fe7b3542b8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.h
@@ -0,0 +1,77 @@
+//===-- SPUFrameInfo.h - Cell SPU frame information -------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains CellSPU frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(SPUFRAMEINFO_H)
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "SPURegisterInfo.h"
+
+namespace llvm {
+  class SPUFrameInfo: public TargetFrameInfo {
+    const TargetMachine &TM;
+    std::pair<unsigned, int> LR[1];
+
+  public:
+    SPUFrameInfo(const TargetMachine &tm);
+
+    //! Return a function's saved spill slots
+    /*!
+      For CellSPU, the only saved spill slot is the link register.
+     */
+    const std::pair<unsigned, int> *
+    getCalleeSaveSpillSlots(unsigned &NumEntries) const;
+
+    //! Stack slot size (16 bytes)
+    static int stackSlotSize() {
+      return 16;
+    }
+    //! Maximum frame offset representable by a signed 10-bit integer
+    /*!
+      This is the maximum frame offset that can be expressed as a 10-bit
+      integer, used in D-form addresses.
+     */
+    static int maxFrameOffset() {
+      return ((1 << 9) - 1) * stackSlotSize();
+    }
+    //! Minimum frame offset representable by a signed 10-bit integer
+    static int minFrameOffset() {
+      return -(1 << 9) * stackSlotSize();
+    }
+    //! Minimum frame size (enough to spill LR + SP)
+    static int minStackSize() {
+      return (2 * stackSlotSize());
+    }
+    //! Frame size required to spill all registers plus frame info
+    static int fullSpillSize() {
+      return (SPURegisterInfo::getNumArgRegs() * stackSlotSize());
+    }
+    //! Number of instructions required to overcome hint-for-branch latency
+    /*!
+      HBR (hint-for-branch) instructions can be inserted when, for example,
+      we know from the control flow graph that a given function, such as
+      printf(), is going to be called. HBRs are only inserted if a sufficient
+      number of instructions separates the HBR from its target. An HBR
+      currently needs 6 cycles to take effect, hence the magic number 6.
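+
+      For example, an HBR emitted fewer than branchHintPenalty()
+      instructions ahead of its branch cannot resolve in time and provides
+      no benefit.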
+ */ + static const int branchHintPenalty() { + return 6; + } + }; +} + +#define SPUFRAMEINFO_H 1 +#endif diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp new file mode 100644 index 00000000000..e4787ebfc31 --- /dev/null +++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp @@ -0,0 +1,137 @@ +//===-- SPUHazardRecognizers.cpp - Cell Hazard Recognizer Impls -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by a team from the Computer Systems Research +// Department at The Aerospace Corporation. +// +// See README.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on Cell SPU +// processors. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sched" + +#include "SPUHazardRecognizers.h" +#include "SPU.h" +#include "SPUInstrInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Cell SPU hazard recognizer +// +// This is the pipeline hazard recognizer for the Cell SPU processor. It does +// very little right now. +//===----------------------------------------------------------------------===// + +SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) : + TII(tii), + EvenOdd(0) +{ +} + +/// Return the pipeline hazard type encountered or generated by this +/// instruction. Currently returns NoHazard. +/// +/// \return NoHazard +HazardRecognizer::HazardType +SPUHazardRecognizer::getHazardType(SDNode *Node) +{ + // Initial thoughts on how to do this, but this code cannot work unless the + // function's prolog and epilog code are also being scheduled so that we can + // accurately determine which pipeline is being scheduled. 
+#if 0 + HazardRecognizer::HazardType retval = NoHazard; + bool mustBeOdd = false; + + switch (Node->getOpcode()) { + case SPU::LQDv16i8: + case SPU::LQDv8i16: + case SPU::LQDv4i32: + case SPU::LQDv4f32: + case SPU::LQDv2f64: + case SPU::LQDr128: + case SPU::LQDr64: + case SPU::LQDr32: + case SPU::LQDr16: + case SPU::LQAv16i8: + case SPU::LQAv8i16: + case SPU::LQAv4i32: + case SPU::LQAv4f32: + case SPU::LQAv2f64: + case SPU::LQAr128: + case SPU::LQAr64: + case SPU::LQAr32: + case SPU::LQXv4i32: + case SPU::LQXr128: + case SPU::LQXr64: + case SPU::LQXr32: + case SPU::LQXr16: + case SPU::STQDv16i8: + case SPU::STQDv8i16: + case SPU::STQDv4i32: + case SPU::STQDv4f32: + case SPU::STQDv2f64: + case SPU::STQDr128: + case SPU::STQDr64: + case SPU::STQDr32: + case SPU::STQDr16: + case SPU::STQDr8: + case SPU::STQAv16i8: + case SPU::STQAv8i16: + case SPU::STQAv4i32: + case SPU::STQAv4f32: + case SPU::STQAv2f64: + case SPU::STQAr128: + case SPU::STQAr64: + case SPU::STQAr32: + case SPU::STQAr16: + case SPU::STQAr8: + case SPU::STQXv16i8: + case SPU::STQXv8i16: + case SPU::STQXv4i32: + case SPU::STQXv4f32: + case SPU::STQXv2f64: + case SPU::STQXr128: + case SPU::STQXr64: + case SPU::STQXr32: + case SPU::STQXr16: + case SPU::STQXr8: + case SPU::RET: + mustBeOdd = true; + break; + default: + // Assume that this instruction can be on the even pipe + break; + } + + if (mustBeOdd && !EvenOdd) + retval = Hazard; + + DOUT << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard " << retval << "\n"; + EvenOdd ^= 1; + return retval; +#else + return NoHazard; +#endif +} + +void SPUHazardRecognizer::EmitInstruction(SDNode *Node) +{ +} + +void SPUHazardRecognizer::AdvanceCycle() +{ + DOUT << "SPUHazardRecognizer::AdvanceCycle\n"; +} + +void SPUHazardRecognizer::EmitNoop() +{ + AdvanceCycle(); +} diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h new file mode 100644 index 00000000000..ce602fd72f5 --- /dev/null +++ b/lib/Target/CellSPU/SPUHazardRecognizers.h @@ -0,0 +1,43 @@ +//===-- SPUHazardRecognizers.h - Cell SPU Hazard Recognizer -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by a team from the Computer Systems Research +// Department at The Aerospace Corporation. +// +// See README.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling on the Cell SPU +// processor. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUHAZRECS_H
+#define SPUHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "SPUInstrInfo.h"
+
+namespace llvm {
+
+/// SPUHazardRecognizer
+class SPUHazardRecognizer : public HazardRecognizer
+{
+private:
+  const TargetInstrInfo &TII;
+  int EvenOdd;
+
+public:
+  SPUHazardRecognizer(const TargetInstrInfo &TII);
+  virtual HazardType getHazardType(SDNode *Node);
+  virtual void EmitInstruction(SDNode *Node);
+  virtual void AdvanceCycle();
+  virtual void EmitNoop();
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
new file mode 100644
index 00000000000..7d5c8ca8614
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -0,0 +1,615 @@
+//===-- SPUISelDAGToDAG.cpp - CellSPU pattern matching inst selector ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for the Cell SPU,
+// converting from a legalized dag to a SPU-target dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "SPUISelLowering.h"
+#include "SPUHazardRecognizers.h"
+#include "SPUFrameInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <iostream>
+#include <queue>
+#include <set>
+
+using namespace llvm;
+
+namespace {
+  //! ConstantSDNode predicate for i64 sign-extended, 10-bit immediates
+  bool
+  isI64IntS10Immediate(ConstantSDNode *CN)
+  {
+    return isS10Constant(CN->getValue());
+  }
+
+  //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates
+  bool
+  isI32IntS10Immediate(ConstantSDNode *CN)
+  {
+    return isS10Constant((int) CN->getValue());
+  }
+
+#if 0
+  //! SDNode predicate for sign-extended, 10-bit immediate values
+  bool
+  isI32IntS10Immediate(SDNode *N)
+  {
+    return (N->getOpcode() == ISD::Constant
+            && isI32IntS10Immediate(cast<ConstantSDNode>(N)));
+  }
+#endif
+
+  //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values
+  bool
+  isI16IntS10Immediate(ConstantSDNode *CN)
+  {
+    return isS10Constant((short) CN->getValue());
+  }
+
+  //! SDNode predicate for i16 sign-extended, 10-bit immediate values
+  bool
+  isI16IntS10Immediate(SDNode *N)
+  {
+    return (N->getOpcode() == ISD::Constant
+            && isI16IntS10Immediate(cast<ConstantSDNode>(N)));
+  }
+
+  //! ConstantSDNode predicate for signed 16-bit values
+  /*!
+    \arg CN The constant SelectionDAG node holding the value
+    \arg Imm The returned 16-bit value, if returning true
+
+    This predicate tests the value in \a CN to see whether it can be
+    represented as a 16-bit, sign-extended quantity. Returns true if
+    this is the case.
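+
+    For example, the i32 constant 0xFFFF8000 survives the round trip
+    through a short (-32768) and qualifies, while 0x00018000 does not.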
+ */ + bool + isIntS16Immediate(ConstantSDNode *CN, short &Imm) + { + MVT::ValueType vt = CN->getValueType(0); + Imm = (short) CN->getValue(); + if (vt >= MVT::i1 && vt <= MVT::i16) { + return true; + } else if (vt == MVT::i32) { + int32_t i_val = (int32_t) CN->getValue(); + short s_val = (short) i_val; + return i_val == s_val; + } else { + int64_t i_val = (int64_t) CN->getValue(); + short s_val = (short) i_val; + return i_val == s_val; + } + + return false; + } + + //! SDNode predicate for signed 16-bit values. + bool + isIntS16Immediate(SDNode *N, short &Imm) + { + return (N->getOpcode() == ISD::Constant + && isIntS16Immediate(cast(N), Imm)); + } + + //! ConstantFPSDNode predicate for representing floats as 16-bit sign ext. + static bool + isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm) + { + MVT::ValueType vt = FPN->getValueType(0); + if (vt == MVT::f32) { + const APFloat &apf = FPN->getValueAPF(); + float fval = apf.convertToFloat(); + int val = *((int *) &fval); + int sval = (int) ((val << 16) >> 16); + Imm = (short) val; + return val == sval; + } + + return false; + } + + //===------------------------------------------------------------------===// + //! MVT::ValueType to useful stuff structure: + + struct valtype_map_s { + MVT::ValueType VT; + unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined) + int prefslot_byte; /// Byte offset of the "preferred" slot + unsigned brcc_eq_ins; /// br_cc equal instruction + unsigned brcc_neq_ins; /// br_cc not equal instruction + }; + + const valtype_map_s valtype_map[] = { + { MVT::i1, 0, 3, 0, 0 }, + { MVT::i8, 0, 3, 0, 0 }, + { MVT::i16, SPU::ORHIr16, 2, SPU::BRHZ, SPU::BRHNZ }, + { MVT::i32, SPU::ORIr32, 0, SPU::BRZ, SPU::BRNZ }, + { MVT::i64, SPU::ORIr64, 0, 0, 0 }, + { MVT::f32, SPU::ORIf32, 0, 0, 0 }, + { MVT::f64, SPU::ORIf64, 0, 0, 0 } + }; + + const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); + + const valtype_map_s *getValueTypeMapEntry(MVT::ValueType VT) + { + const valtype_map_s *retval = 0; + for (size_t i = 0; i < n_valtype_map; ++i) { + if (valtype_map[i].VT == VT) { + retval = valtype_map + i; + break; + } + } + + +#ifndef NDEBUG + if (retval == 0) { + cerr << "SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns NULL for " + << MVT::getValueTypeString(VT) + << "\n"; + abort(); + } +#endif + + return retval; + } +} + +//===--------------------------------------------------------------------===// +/// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine +/// instructions for SelectionDAG operations. +/// +class SPUDAGToDAGISel : + public SelectionDAGISel +{ + SPUTargetMachine &TM; + SPUTargetLowering &SPUtli; + unsigned GlobalBaseReg; + +public: + SPUDAGToDAGISel(SPUTargetMachine &tm) : + SelectionDAGISel(*tm.getTargetLowering()), + TM(tm), + SPUtli(*tm.getTargetLowering()) + {} + + virtual bool runOnFunction(Function &Fn) { + // Make sure we re-emit a set of the global base reg if necessary + GlobalBaseReg = 0; + SelectionDAGISel::runOnFunction(Fn); + return true; + } + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDOperand getI32Imm(uint32_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + /// getI64Imm - Return a target constant with the specified value, of type + /// i64. + inline SDOperand getI64Imm(uint64_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i64); + } + + /// getSmallIPtrImm - Return a target constant of pointer type. 
+ inline SDOperand getSmallIPtrImm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); + } + + /// Select - Convert the specified operand from a target-independent to a + /// target-specific node if it hasn't already been changed. + SDNode *Select(SDOperand Op); + + /// Return true if the address N is a RI7 format address [r+imm] + bool SelectDForm2Addr(SDOperand Op, SDOperand N, SDOperand &Disp, + SDOperand &Base); + + //! Returns true if the address N is an A-form (local store) address + bool SelectAFormAddr(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index); + + //! D-form address predicate + bool SelectDFormAddr(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index); + + //! Address predicate if N can be expressed as an indexed [r+r] operation. + bool SelectXFormAddr(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op, + char ConstraintCode, + std::vector &OutOps, + SelectionDAG &DAG) { + SDOperand Op0, Op1; + switch (ConstraintCode) { + default: return true; + case 'm': // memory + if (!SelectDFormAddr(Op, Op, Op0, Op1) + && !SelectAFormAddr(Op, Op, Op0, Op1)) + SelectXFormAddr(Op, Op, Op0, Op1); + break; + case 'o': // offsetable + if (!SelectDFormAddr(Op, Op, Op0, Op1) + && !SelectAFormAddr(Op, Op, Op0, Op1)) { + Op0 = Op; + AddToISelQueue(Op0); // r+0. + Op1 = getSmallIPtrImm(0); + } + break; + case 'v': // not offsetable +#if 1 + assert(0 && "InlineAsmMemoryOperand 'v' constraint not handled."); +#else + SelectAddrIdxOnly(Op, Op, Op0, Op1); +#endif + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + return false; + } + + /// InstructionSelectBasicBlock - This callback is invoked by + /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. + virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + + virtual const char *getPassName() const { + return "Cell SPU DAG->DAG Pattern Instruction Selection"; + } + + /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for + /// this target when scheduling the DAG. + virtual HazardRecognizer *CreateTargetHazardRecognizer() { + const TargetInstrInfo *II = SPUtli.getTargetMachine().getInstrInfo(); + assert(II && "No InstrInfo?"); + return new SPUHazardRecognizer(*II); + } + + // Include the pieces autogenerated from the target description. +#include "SPUGenDAGISel.inc" +}; + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. +void +SPUDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) +{ + DEBUG(BB->dump()); + + // Select target instructions for the DAG. + DAG.setRoot(SelectRoot(DAG.getRoot())); + DAG.RemoveDeadNodes(); + + // Emit machine code to BB. 
+ ScheduleAndEmitDAG(DAG); +} + +bool +SPUDAGToDAGISel::SelectDForm2Addr(SDOperand Op, SDOperand N, SDOperand &Disp, + SDOperand &Base) { + unsigned Opc = N.getOpcode(); + unsigned VT = N.getValueType(); + MVT::ValueType PtrVT = SPUtli.getPointerTy(); + ConstantSDNode *CN = 0; + int Imm; + + if (Opc == ISD::ADD) { + SDOperand Op0 = N.getOperand(0); + SDOperand Op1 = N.getOperand(1); + if (Op1.getOpcode() == ISD::Constant || + Op1.getOpcode() == ISD::TargetConstant) { + CN = cast(Op1); + Imm = int(CN->getValue()); + if (Imm <= 0xff) { + Disp = CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); + Base = Op0; + return true; + } + } + } else if (Opc == ISD::GlobalAddress + || Opc == ISD::TargetGlobalAddress + || Opc == ISD::Register) { + // Plain old local store address: + Disp = CurDAG->getTargetConstant(0, VT); + Base = N; + return true; + } else if (Opc == SPUISD::DFormAddr) { + // D-Form address: This is pretty straightforward, naturally... + CN = cast(N.getOperand(1)); + assert(CN != 0 && "SelectDFormAddr/SPUISD::DForm2Addr expecting constant"); + Imm = unsigned(CN->getValue()); + if (Imm < 0xff) { + Disp = CurDAG->getTargetConstant(CN->getValue(), PtrVT); + Base = N.getOperand(0); + return true; + } + } + + return false; +} + +/*! + \arg Op The ISD instructio operand + \arg N The address to be tested + \arg Base The base address + \arg Index The base address index + */ +bool +SPUDAGToDAGISel::SelectAFormAddr(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index) { + // These match the addr256k operand type: + MVT::ValueType PtrVT = SPUtli.getPointerTy(); + MVT::ValueType OffsVT = MVT::i16; + + switch (N.getOpcode()) { + case ISD::Constant: + case ISD::TargetConstant: { + // Loading from a constant address. + ConstantSDNode *CN = dyn_cast(N); + int Imm = (int)CN->getValue(); + if (Imm < 0x3ffff && (Imm & 0x3) == 0) { + Base = CurDAG->getTargetConstant(Imm, PtrVT); + // Note that this operand will be ignored by the assembly printer... + Index = CurDAG->getTargetConstant(0, OffsVT); + return true; + } + } + case ISD::ConstantPool: + case ISD::TargetConstantPool: { + // The constant pool address is N. Base is a dummy that will be ignored by + // the assembly printer. + Base = N; + Index = CurDAG->getTargetConstant(0, OffsVT); + return true; + } + + case ISD::GlobalAddress: + case ISD::TargetGlobalAddress: { + // The global address is N. Base is a dummy that is ignored by the + // assembly printer. + Base = N; + Index = CurDAG->getTargetConstant(0, OffsVT); + return true; + } + } + + return false; +} + +/*! + \arg Op The ISD instruction (ignored) + \arg N The address to be tested + \arg Base Base address register/pointer + \arg Index Base address index + + Examine the input address by a base register plus a signed 10-bit + displacement, [r+I10] (D-form address). + + \return true if \a N is a D-form address with \a Base and \a Index set + to non-empty SDOperand instances. 
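+
+  For example, a 16-byte-aligned stack slot such as [$sp + 32] fits the
+  signed 10-bit displacement field and can be selected as a D-form access;
+  an offset outside that range cannot.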
+*/
+bool
+SPUDAGToDAGISel::SelectDFormAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                                 SDOperand &Index) {
+  unsigned Opc = N.getOpcode();
+  unsigned PtrTy = SPUtli.getPointerTy();
+
+  if (Opc == ISD::Register) {
+    Base = N;
+    Index = CurDAG->getTargetConstant(0, PtrTy);
+    return true;
+  } else if (Opc == ISD::FrameIndex) {
+    // Stack frame index must be less than 512 (divided by 16):
+    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N);
+    DEBUG(cerr << "SelectDFormAddr: ISD::FrameIndex = "
+          << FI->getIndex() << "\n");
+    if (FI->getIndex() < SPUFrameInfo::maxFrameOffset()) {
+      Base = CurDAG->getTargetConstant(0, PtrTy);
+      Index = CurDAG->getTargetFrameIndex(FI->getIndex(), PtrTy);
+      return true;
+    }
+  } else if (Opc == ISD::ADD) {
+    // Generated by getelementptr
+    const SDOperand Op0 = N.getOperand(0); // Frame index/base
+    const SDOperand Op1 = N.getOperand(1); // Offset within base
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1);
+
+    // Not a constant?
+    if (CN == 0)
+      return false;
+
+    int32_t offset = (int32_t) CN->getSignExtended();
+    unsigned Opc0 = Op0.getOpcode();
+
+    if ((offset & 0xf) != 0) {
+      cerr << "SelectDFormAddr: unaligned offset = " << offset << "\n";
+      abort();
+      /*NOTREACHED*/
+    }
+
+    if (Opc0 == ISD::FrameIndex) {
+      FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0);
+      DEBUG(cerr << "SelectDFormAddr: ISD::ADD offset = " << offset
+            << " frame index = " << FI->getIndex() << "\n");
+
+      if (FI->getIndex() < SPUFrameInfo::maxFrameOffset()) {
+        Base = CurDAG->getTargetConstant(offset, PtrTy);
+        Index = CurDAG->getTargetFrameIndex(FI->getIndex(), PtrTy);
+        return true;
+      }
+    } else if (offset > SPUFrameInfo::minFrameOffset()
+               && offset < SPUFrameInfo::maxFrameOffset()) {
+      Base = CurDAG->getTargetConstant(offset, PtrTy);
+      if (Opc0 == ISD::GlobalAddress) {
+        // Convert global address to target global address
+        GlobalAddressSDNode *GV = dyn_cast<GlobalAddressSDNode>(Op0);
+        Index = CurDAG->getTargetGlobalAddress(GV->getGlobal(), PtrTy);
+        return true;
+      } else {
+        // Otherwise, just take operand 0
+        Index = Op0;
+        return true;
+      }
+    }
+  } else if (Opc == SPUISD::DFormAddr) {
+    // D-Form address: This is pretty straightforward, naturally...
+    ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
+    assert(CN != 0 && "SelectDFormAddr/SPUISD::DFormAddr expecting constant");
+    Base = CurDAG->getTargetConstant(CN->getValue(), PtrTy);
+    Index = N.getOperand(0);
+    return true;
+  }
+
+  return false;
+}
+
+/*!
+  \arg Op The ISD instruction operand
+  \arg N The address operand
+  \arg Base The base pointer operand
+  \arg Index The offset/index operand
+
+  If the address \a N can be expressed as an [r + s10imm] address, returns
+  false.  Otherwise, creates two operands, Base and Index, that will become
+  the [r+r] address.
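+
+  For example, a global materialized as (SPUISD::Hi(sym) + SPUISD::Lo(sym))
+  has no single 10-bit displacement, so its two halves become the [r+r]
+  operands of an X-form access.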
+*/ +bool +SPUDAGToDAGISel::SelectXFormAddr(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index) { + if (SelectAFormAddr(Op, N, Base, Index) + || SelectDFormAddr(Op, N, Base, Index)) + return false; + + unsigned Opc = N.getOpcode(); + + if (Opc == ISD::ADD) { + SDOperand N1 = N.getOperand(0); + SDOperand N2 = N.getOperand(1); + unsigned N1Opc = N1.getOpcode(); + unsigned N2Opc = N2.getOpcode(); + + if ((N1Opc == SPUISD::Hi && N2Opc == SPUISD::Lo) + || (N1Opc == SPUISD::Lo && N2Opc == SPUISD::Hi)) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } else { + cerr << "SelectXFormAddr: Unhandled ADD operands:\n"; + N1.Val->dump(); + cerr << "\n"; + N2.Val->dump(); + cerr << "\n"; + abort(); + /*UNREACHED*/ + } + } else if (N.getNumOperands() == 2) { + SDOperand N1 = N.getOperand(0); + SDOperand N2 = N.getOperand(1); + unsigned N1Opc = N1.getOpcode(); + unsigned N2Opc = N2.getOpcode(); + + if ((N1Opc == ISD::CopyToReg || N1Opc == ISD::Register) + && (N2Opc == ISD::CopyToReg || N2Opc == ISD::Register)) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + /*UNREACHED*/ + } else { + cerr << "SelectXFormAddr: 2-operand unhandled operand:\n"; + N.Val->dump(); + cerr << "\n"; + abort(); + /*UNREACHED*/ + } + } else { + cerr << "SelectXFormAddr: Unhandled operand type:\n"; + N.Val->dump(); + cerr << "\n"; + abort(); + /*UNREACHED*/ + } + + return false; +} + +//! Convert the operand from a target-independent to a target-specific node +/*! + */ +SDNode * +SPUDAGToDAGISel::Select(SDOperand Op) { + SDNode *N = Op.Val; + unsigned Opc = N->getOpcode(); + + if (Opc >= ISD::BUILTIN_OP_END && Opc < SPUISD::FIRST_NUMBER) { + return NULL; // Already selected. + } else if (Opc == ISD::FrameIndex) { + // Selects to AIr32 FI, 0 which in turn will become AIr32 SP, imm. + int FI = cast(N)->getIndex(); + SDOperand TFI = CurDAG->getTargetFrameIndex(FI, SPUtli.getPointerTy()); + + DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with AI32 TFI, 0\n"); + return CurDAG->SelectNodeTo(N, SPU::AIr32, Op.getValueType(), TFI, + CurDAG->getTargetConstant(0, MVT::i32)); + } else if (Opc == SPUISD::LDRESULT) { + // Custom select instructions for LDRESULT + unsigned VT = N->getValueType(0); + SDOperand Arg = N->getOperand(0); + SDOperand Chain = N->getOperand(1); + SDOperand Zero = CurDAG->getTargetConstant(0, VT); + SDNode *Result; + const valtype_map_s *vtm = getValueTypeMapEntry(VT); + + if (vtm->ldresult_ins == 0) { + cerr << "LDRESULT for unsupported type: " + << MVT::getValueTypeString(VT) + << "\n"; + abort(); + } else + Opc = vtm->ldresult_ins; + + AddToISelQueue(Arg); + AddToISelQueue(Zero); + AddToISelQueue(Chain); + Result = CurDAG->SelectNodeTo(N, Opc, VT, MVT::Other, Arg, Zero, Chain); + Chain = SDOperand(Result, 1); + return Result; + } + + return SelectCode(Op); +} + +/// createPPCISelDag - This pass converts a legalized DAG into a +/// SPU-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) { + return new SPUDAGToDAGISel(TM); +} diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp new file mode 100644 index 00000000000..91c0024d744 --- /dev/null +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -0,0 +1,2673 @@ +//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by a team from the Computer Systems Research +// Department at The Aerospace Corporation. 
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPUTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPURegisterNames.h"
+#include "SPUISelLowering.h"
+#include "SPUTargetMachine.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+
+#include <map>
+
+using namespace llvm;
+
+// Used in getTargetNodeName() below
+namespace {
+  std::map<unsigned, const char *> node_names;
+
+  //! MVT::ValueType mapping to useful data for Cell SPU
+  struct valtype_map_s {
+    const MVT::ValueType valtype;
+    const int prefslot_byte;
+  };
+
+  const valtype_map_s valtype_map[] = {
+    { MVT::i1,   3 },
+    { MVT::i8,   3 },
+    { MVT::i16,  2 },
+    { MVT::i32,  0 },
+    { MVT::f32,  0 },
+    { MVT::i64,  0 },
+    { MVT::f64,  0 },
+    { MVT::i128, 0 }
+  };
+
+  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
+
+  const valtype_map_s *getValueTypeMapEntry(MVT::ValueType VT) {
+    const valtype_map_s *retval = 0;
+
+    for (size_t i = 0; i < n_valtype_map; ++i) {
+      if (valtype_map[i].valtype == VT) {
+        retval = valtype_map + i;
+        break;
+      }
+    }
+
+#ifndef NDEBUG
+    if (retval == 0) {
+      cerr << "getValueTypeMapEntry returns NULL for "
+           << MVT::getValueTypeString(VT)
+           << "\n";
+      abort();
+    }
+#endif
+
+    return retval;
+  }
+
+  //! Predicate that returns true if operand is a memory target
+  /*!
+    \arg Op Operand to test
+    \return true if the operand is a memory target (i.e., global
+    address, external symbol, constant pool) or an existing D-Form
+    address.
+   */
+  bool isMemoryOperand(const SDOperand &Op)
+  {
+    const unsigned Opc = Op.getOpcode();
+    return (Opc == ISD::GlobalAddress
+            || Opc == ISD::GlobalTLSAddress
+            || Opc == ISD::FrameIndex
+            || Opc == ISD::JumpTable
+            || Opc == ISD::ConstantPool
+            || Opc == ISD::ExternalSymbol
+            || Opc == ISD::TargetGlobalAddress
+            || Opc == ISD::TargetGlobalTLSAddress
+            || Opc == ISD::TargetFrameIndex
+            || Opc == ISD::TargetJumpTable
+            || Opc == ISD::TargetConstantPool
+            || Opc == ISD::TargetExternalSymbol
+            || Opc == SPUISD::DFormAddr);
+  }
+}
+
+SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
+  : TargetLowering(TM),
+    SPUTM(TM)
+{
+  // Fold away setcc operations if possible.
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // Set up the SPU's register classes:
+  // NOTE: i8 register class is not registered because we cannot determine when
+  // we need to zero or sign extend for custom-lowered loads and stores.
+  addRegisterClass(MVT::i16, SPU::R16CRegisterClass);
+  addRegisterClass(MVT::i32, SPU::R32CRegisterClass);
+  addRegisterClass(MVT::i64, SPU::R64CRegisterClass);
+  addRegisterClass(MVT::f32, SPU::R32FPRegisterClass);
+  addRegisterClass(MVT::f64, SPU::R64FPRegisterClass);
+  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
+
+  // SPU has no sign or zero extended loads for i1, i8, i16:
+  setLoadXAction(ISD::EXTLOAD, MVT::i1, Custom);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setStoreXAction(MVT::i1, Custom);
+
+  setLoadXAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setStoreXAction(MVT::i8, Custom);
+
+  setLoadXAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i16, Custom);
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+
+  // SPU constant load actions are custom lowered:
+  setOperationAction(ISD::Constant, MVT::i64, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+  // SPU's loads and stores have to be custom lowered:
+  for (unsigned sctype = (unsigned) MVT::i1; sctype < (unsigned) MVT::f128;
+       ++sctype) {
+    setOperationAction(ISD::LOAD, sctype, Custom);
+    setOperationAction(ISD::STORE, sctype, Custom);
+  }
+
+  // SPU supports BRCOND, although DAGCombine will convert BRCONDs
+  // into BR_CCs. BR_CC instructions are custom selected in
+  // SPUDAGToDAGISel.
+  setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+  // Expand the jumptable branches
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+  // SPU has no intrinsics for these particular operations:
+  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
+  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
+
+  // SPU has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/fmod
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+  // SPU has no hardware square root; expand FSQRT.
+  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+  // SPU can do rotate right and left, so legalize it... but customize for i8
+  // because instructions don't exist.
+ setOperationAction(ISD::ROTR, MVT::i32, Legal); + setOperationAction(ISD::ROTR, MVT::i16, Legal); + setOperationAction(ISD::ROTR, MVT::i8, Custom); + setOperationAction(ISD::ROTL, MVT::i32, Legal); + setOperationAction(ISD::ROTL, MVT::i16, Legal); + setOperationAction(ISD::ROTL, MVT::i8, Custom); + // SPU has no native version of shift left/right for i8 + setOperationAction(ISD::SHL, MVT::i8, Custom); + setOperationAction(ISD::SRL, MVT::i8, Custom); + setOperationAction(ISD::SRA, MVT::i8, Custom); + + // Custom lower i32 multiplications + setOperationAction(ISD::MUL, MVT::i32, Custom); + + // Need to custom handle (some) common i8 math ops + setOperationAction(ISD::SUB, MVT::i8, Custom); + setOperationAction(ISD::MUL, MVT::i8, Custom); + + // SPU does not have BSWAP. It does have i32 support CTLZ. + // CTPOP has to be custom lowered. + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i8, Custom); + setOperationAction(ISD::CTPOP, MVT::i16, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); + + setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i64, Expand); + + setOperationAction(ISD::CTLZ , MVT::i32, Legal); + + // SPU does not have select or setcc + setOperationAction(ISD::SELECT, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i8, Expand); + setOperationAction(ISD::SELECT, MVT::i16, Expand); + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::i64, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + + setOperationAction(ISD::SETCC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i8, Expand); + setOperationAction(ISD::SETCC, MVT::i16, Expand); + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::i64, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::SETCC, MVT::f64, Expand); + + // SPU has a legal FP -> signed INT instruction + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + + // FDIV on SPU requires custom lowering + setOperationAction(ISD::FDIV, MVT::f32, Custom); + //setOperationAction(ISD::FDIV, MVT::f64, Custom); + + // SPU has [U|S]INT_TO_FP + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + + setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand); + setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand); + setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand); + setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand); + + // We cannot sextinreg(i1). Expand to shifts. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // Support label based line numbers. 
+ setOperationAction(ISD::LOCATION, MVT::Other, Expand); + setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand); + + // We want to legalize GlobalAddress and ConstantPool nodes into the + // appropriate instructions to materialize the address. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::f32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + setOperationAction(ISD::ConstantPool, MVT::f64, Custom); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + // RET must be custom lowered, to meet ABI requirements + setOperationAction(ISD::RET, MVT::Other, Custom); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + + // Use the default implementation. + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Expand); + + // Cell SPU has instructions for converting between i64 and fp. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + + // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); + + // BUILD_PAIR can't be handled natively, and should be expanded to shl/or + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + + // First set operation action for all vector types to expand. Then we + // will selectively turn on ones that can be effectively codegen'd. + addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); + + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + // add/sub are legal for all supported vector VT's. + setOperationAction(ISD::ADD , (MVT::ValueType)VT, Legal); + setOperationAction(ISD::SUB , (MVT::ValueType)VT, Legal); + // mul has to be custom lowered. 
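+    // (SPU multiplies are 16 x 16 -> 32 bit, so wider vector multiplies
+    // have to be assembled from partial products; see LowerVectorMUL below.)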
+ setOperationAction(ISD::MUL , (MVT::ValueType)VT, Custom); + + setOperationAction(ISD::AND , (MVT::ValueType)VT, Legal); + setOperationAction(ISD::OR , (MVT::ValueType)VT, Legal); + setOperationAction(ISD::XOR , (MVT::ValueType)VT, Legal); + setOperationAction(ISD::LOAD , (MVT::ValueType)VT, Legal); + setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Legal); + setOperationAction(ISD::STORE, (MVT::ValueType)VT, Legal); + + // These operations need to be expanded: + setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Custom); + + // Custom lower build_vector, constant pool spills, insert and + // extract vector elements: + setOperationAction(ISD::BUILD_VECTOR, (MVT::ValueType)VT, Custom); + setOperationAction(ISD::ConstantPool, (MVT::ValueType)VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, (MVT::ValueType)VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Custom); + } + + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + setOperationAction(ISD::AND, MVT::v16i8, Custom); + setOperationAction(ISD::OR, MVT::v16i8, Custom); + setOperationAction(ISD::XOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); + + setSetCCResultType(MVT::i32); + setShiftAmountType(MVT::i32); + setSetCCResultContents(ZeroOrOneSetCCResult); + + setStackPointerRegisterToSaveRestore(SPU::R1); + + // We have target-specific dag combine patterns for the following nodes: + // e.g., setTargetDAGCombine(ISD::SUB); + + computeRegisterProperties(); +} + +const char * +SPUTargetLowering::getTargetNodeName(unsigned Opcode) const +{ + if (node_names.empty()) { + node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG"; + node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi"; + node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo"; + node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr"; + node_names[(unsigned) SPUISD::DFormAddr] = "SPUISD::DFormAddr"; + node_names[(unsigned) SPUISD::XFormAddr] = "SPUISD::XFormAddr"; + node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT"; + node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL"; + node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB"; + node_names[(unsigned) SPUISD::INSERT_MASK] = "SPUISD::INSERT_MASK"; + node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; + node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR"; + node_names[(unsigned) SPUISD::EXTRACT_ELT0] = "SPUISD::EXTRACT_ELT0"; + node_names[(unsigned) SPUISD::EXTRACT_ELT0_CHAINED] = "SPUISD::EXTRACT_ELT0_CHAINED"; + node_names[(unsigned) SPUISD::EXTRACT_I1_ZEXT] = "SPUISD::EXTRACT_I1_ZEXT"; + node_names[(unsigned) SPUISD::EXTRACT_I1_SEXT] = "SPUISD::EXTRACT_I1_SEXT"; + node_names[(unsigned) SPUISD::EXTRACT_I8_ZEXT] = "SPUISD::EXTRACT_I8_ZEXT"; + node_names[(unsigned) SPUISD::EXTRACT_I8_SEXT] = "SPUISD::EXTRACT_I8_SEXT"; + node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY"; + node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU"; + node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH"; + node_names[(unsigned) SPUISD::MPYHH] = "SPUISD::MPYHH"; + node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL"; + node_names[(unsigned) SPUISD::VEC_SRL] 
= "SPUISD::VEC_SRL"; + node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA"; + node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; + node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; + node_names[(unsigned) SPUISD::ROTBYTES_RIGHT_Z] = + "SPUISD::ROTBYTES_RIGHT_Z"; + node_names[(unsigned) SPUISD::ROTBYTES_RIGHT_S] = + "SPUISD::ROTBYTES_RIGHT_S"; + node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; + node_names[(unsigned) SPUISD::ROTBYTES_LEFT_CHAINED] = + "SPUISD::ROTBYTES_LEFT_CHAINED"; + node_names[(unsigned) SPUISD::FSMBI] = "SPUISD::FSMBI"; + node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB"; + node_names[(unsigned) SPUISD::SFPConstant] = "SPUISD::SFPConstant"; + node_names[(unsigned) SPUISD::FPInterp] = "SPUISD::FPInterp"; + node_names[(unsigned) SPUISD::FPRecipEst] = "SPUISD::FPRecipEst"; + node_names[(unsigned) SPUISD::SEXT32TO64] = "SPUISD::SEXT32TO64"; + } + + std::map::iterator i = node_names.find(Opcode); + + return ((i != node_names.end()) ? i->second : 0); +} + +//===----------------------------------------------------------------------===// +// Calling convention code: +//===----------------------------------------------------------------------===// + +#include "SPUGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// LowerOperation implementation +//===----------------------------------------------------------------------===// + +/// Custom lower loads for CellSPU +/*! + All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements + within a 16-byte block, we have to rotate to extract the requested element. + */ +static SDOperand +LowerLOAD(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + LoadSDNode *LN = cast(Op); + SDOperand basep = LN->getBasePtr(); + SDOperand the_chain = LN->getChain(); + MVT::ValueType VT = LN->getLoadedVT(); + MVT::ValueType OpVT = Op.Val->getValueType(0); + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + ISD::LoadExtType ExtType = LN->getExtensionType(); + unsigned alignment = LN->getAlignment(); + const valtype_map_s *vtm = getValueTypeMapEntry(VT); + SDOperand Ops[8]; + + // For an extending load of an i1 variable, just call it i8 (or whatever we + // were passed) and make it zero-extended: + if (VT == MVT::i1) { + VT = OpVT; + ExtType = ISD::ZEXTLOAD; + } + + switch (LN->getAddressingMode()) { + case ISD::UNINDEXED: { + SDOperand result; + SDOperand rot_op, rotamt; + SDOperand ptrp; + int c_offset; + int c_rotamt; + + // The vector type we really want to be when we load the 16-byte chunk + MVT::ValueType vecVT, opVecVT; + + if (VT != MVT::i1) + vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT))); + else + vecVT = MVT::v16i8; + + opVecVT = MVT::getVectorType(OpVT, (128 / MVT::getSizeInBits(OpVT))); + + if (basep.getOpcode() == ISD::ADD) { + const ConstantSDNode *CN = cast(basep.Val->getOperand(1)); + + assert(CN != NULL + && "LowerLOAD: ISD::ADD operand 1 is not constant"); + + c_offset = (int) CN->getValue(); + c_rotamt = (int) (c_offset & 0xf); + + // Adjust the rotation amount to ensure that the final result ends up in + // the preferred slot: + c_rotamt -= vtm->prefslot_byte; + ptrp = basep.getOperand(0); + } else { + c_offset = 0; + c_rotamt = -vtm->prefslot_byte; + ptrp = basep; + } + + if (alignment == 16) { + // 16-byte aligned load into preferred slot, no rotation + if (c_rotamt == 0) { + if (isMemoryOperand(ptrp)) + // Return unchanged + return SDOperand(); + else { + // 
Return modified D-Form address for pointer: + ptrp = DAG.getNode(SPUISD::DFormAddr, PtrVT, + ptrp, DAG.getConstant((c_offset & ~0xf), PtrVT)); + if (VT == OpVT) + return DAG.getLoad(VT, LN->getChain(), ptrp, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); + else + return DAG.getExtLoad(ExtType, VT, LN->getChain(), ptrp, LN->getSrcValue(), + LN->getSrcValueOffset(), OpVT, + LN->isVolatile(), 16); + } + } else { + // Need to rotate... + if (c_rotamt < 0) + c_rotamt += 16; + // Realign the base pointer, with a D-Form address + if ((c_offset & ~0xf) != 0 || !isMemoryOperand(ptrp)) + basep = DAG.getNode(SPUISD::DFormAddr, PtrVT, + ptrp, DAG.getConstant((c_offset & ~0xf), MVT::i32)); + else + basep = ptrp; + + // Rotate the load: + rot_op = DAG.getLoad(MVT::v16i8, the_chain, basep, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); + the_chain = rot_op.getValue(1); + rotamt = DAG.getConstant(c_rotamt, MVT::i16); + + SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other); + Ops[0] = the_chain; + Ops[1] = rot_op; + Ops[2] = rotamt; + + result = DAG.getNode(SPUISD::ROTBYTES_LEFT_CHAINED, vecvts, Ops, 3); + the_chain = result.getValue(1); + + if (VT == OpVT || ExtType == ISD::EXTLOAD) { + SDVTList scalarvts; + Ops[0] = the_chain; + Ops[1] = result; + if (OpVT == VT) { + scalarvts = DAG.getVTList(VT, MVT::Other); + } else { + scalarvts = DAG.getVTList(OpVT, MVT::Other); + } + + result = DAG.getNode(ISD::BIT_CONVERT, (OpVT == VT ? vecVT : opVecVT), + result); + Ops[0] = the_chain; + Ops[1] = result; + result = DAG.getNode(SPUISD::EXTRACT_ELT0_CHAINED, scalarvts, Ops, 2); + the_chain = result.getValue(1); + } else { + // Handle the sign and zero-extending loads for i1 and i8: + unsigned NewOpC; + + if (ExtType == ISD::SEXTLOAD) { + NewOpC = (OpVT == MVT::i1 + ? SPUISD::EXTRACT_I1_SEXT + : SPUISD::EXTRACT_I8_SEXT); + } else if (ExtType == ISD::ZEXTLOAD) { + NewOpC = (OpVT == MVT::i1 + ? SPUISD::EXTRACT_I1_ZEXT + : SPUISD::EXTRACT_I8_ZEXT); + } + + result = DAG.getNode(NewOpC, OpVT, result); + } + + SDVTList retvts = DAG.getVTList(OpVT, MVT::Other); + SDOperand retops[2] = { result, the_chain }; + + result = DAG.getNode(SPUISD::LDRESULT, retvts, retops, 2); + return result; + /*UNREACHED*/ + } + } else { + // Misaligned 16-byte load: + if (basep.getOpcode() == ISD::LOAD) { + LN = cast(basep); + if (LN->getAlignment() == 16) { + // We can verify that we're really loading from a 16-byte aligned + // chunk. 
Encapsulate basep as a D-Form address and return a new + // load: + basep = DAG.getNode(SPUISD::DFormAddr, PtrVT, basep, + DAG.getConstant(0, PtrVT)); + if (OpVT == VT) + return DAG.getLoad(VT, LN->getChain(), basep, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); + else + return DAG.getExtLoad(ExtType, VT, LN->getChain(), basep, + LN->getSrcValue(), LN->getSrcValueOffset(), + OpVT, LN->isVolatile(), 16); + } + } + + // Catch all other cases where we can't guarantee that we have a + // 16-byte aligned entity, which means resorting to an X-form + // address scheme: + + SDOperand ZeroOffs = DAG.getConstant(0, PtrVT); + SDOperand loOp = DAG.getNode(SPUISD::Lo, VT, basep, ZeroOffs); + SDOperand hiOp = DAG.getNode(SPUISD::Hi, VT, basep, ZeroOffs); + + ptrp = DAG.getNode(ISD::ADD, PtrVT, loOp, hiOp); + + SDOperand alignLoad = + DAG.getLoad(opVecVT, LN->getChain(), ptrp, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); + + SDOperand insertEltOp = + DAG.getNode(SPUISD::INSERT_MASK, vecVT, ptrp); + + result = DAG.getNode(SPUISD::SHUFB, opVecVT, + alignLoad, + alignLoad, + DAG.getNode(ISD::BIT_CONVERT, opVecVT, insertEltOp)); + + result = DAG.getNode(SPUISD::EXTRACT_ELT0, OpVT, result); + + SDVTList retvts = DAG.getVTList(OpVT, MVT::Other); + SDOperand retops[2] = { result, the_chain }; + + result = DAG.getNode(SPUISD::LDRESULT, retvts, retops, 2); + return result; + } + break; + } + case ISD::PRE_INC: + case ISD::PRE_DEC: + case ISD::POST_INC: + case ISD::POST_DEC: + case ISD::LAST_INDEXED_MODE: + cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than " + "UNINDEXED\n"; + cerr << (unsigned) LN->getAddressingMode() << "\n"; + abort(); + /*NOTREACHED*/ + } + + return SDOperand(); +} + +/// Custom lower stores for CellSPU +/*! + All CellSPU stores are aligned to 16-byte boundaries, so for elements + within a 16-byte block, we have to generate a shuffle to insert the + requested element into its place, then store the resulting block. + */ +static SDOperand +LowerSTORE(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + StoreSDNode *SN = cast(Op); + SDOperand Value = SN->getValue(); + MVT::ValueType VT = Value.getValueType(); + MVT::ValueType StVT = (!SN->isTruncatingStore() ? VT : SN->getStoredVT()); + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDOperand the_chain = SN->getChain(); + unsigned alignment = SN->getAlignment(); + const valtype_map_s *vtm = getValueTypeMapEntry(VT); + + switch (SN->getAddressingMode()) { + case ISD::UNINDEXED: { + SDOperand basep = SN->getBasePtr(); + SDOperand ptrOp; + int offset; + + if (basep.getOpcode() == ISD::ADD) { + const ConstantSDNode *CN = cast(basep.Val->getOperand(1)); + assert(CN != NULL + && "LowerSTORE: ISD::ADD operand 1 is not constant"); + offset = unsigned(CN->getValue()); + ptrOp = basep.getOperand(0); + DEBUG(cerr << "LowerSTORE: StoreSDNode ISD:ADD offset = " + << offset + << "\n"); + } else { + ptrOp = basep; + offset = 0; + } + + // The vector type we really want to load from the 16-byte chunk, except + // in the case of MVT::i1, which has to be v16i8. + unsigned vecVT, stVecVT; + + if (StVT != MVT::i1) + stVecVT = MVT::getVectorType(StVT, (128 / MVT::getSizeInBits(StVT))); + else + stVecVT = MVT::v16i8; + vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT))); + + // Realign the pointer as a D-Form address (ptrOp is the pointer, + // to force a register load with the address; basep is the actual + // dform addr offs($reg). 
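+    // (For example, a store to $3 + 18 realigns to basep = 16($3), and the
+    // value is inserted at byte offset 2 within that quadword.)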
+    ptrOp = DAG.getNode(SPUISD::DFormAddr, PtrVT, ptrOp,
+                        DAG.getConstant(0, PtrVT));
+    basep = DAG.getNode(SPUISD::DFormAddr, PtrVT,
+                        ptrOp, DAG.getConstant((offset & ~0xf), PtrVT));
+
+    // Create the 16-byte aligned vector load
+    SDOperand alignLoad =
+      DAG.getLoad(vecVT, the_chain, basep,
+                  SN->getSrcValue(), SN->getSrcValueOffset(),
+                  SN->isVolatile(), 16);
+    the_chain = alignLoad.getValue(1);
+
+    LoadSDNode *LN = cast<LoadSDNode>(alignLoad);
+    SDOperand theValue = SN->getValue();
+    SDOperand result;
+
+    if (StVT != VT
+        && (theValue.getOpcode() == ISD::AssertZext
+            || theValue.getOpcode() == ISD::AssertSext)) {
+      // Drill down and get the value for zero- and sign-extended
+      // quantities
+      theValue = theValue.getOperand(0);
+    }
+
+    SDOperand insertEltOp =
+      DAG.getNode(SPUISD::INSERT_MASK, stVecVT,
+                  DAG.getNode(SPUISD::DFormAddr, PtrVT,
+                              ptrOp,
+                              DAG.getConstant((offset & 0xf), PtrVT)));
+
+    result = DAG.getNode(SPUISD::SHUFB, vecVT,
+                         DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue),
+                         alignLoad,
+                         DAG.getNode(ISD::BIT_CONVERT, vecVT, insertEltOp));
+
+    result = DAG.getStore(the_chain, result, basep,
+                          LN->getSrcValue(), LN->getSrcValueOffset(),
+                          LN->isVolatile(), LN->getAlignment());
+
+    return result;
+    /*UNREACHED*/
+  }
+  case ISD::PRE_INC:
+  case ISD::PRE_DEC:
+  case ISD::POST_INC:
+  case ISD::POST_DEC:
+  case ISD::LAST_INDEXED_MODE:
+    cerr << "LowerSTORE: Got a StoreSDNode with an addr mode other than "
+            "UNINDEXED\n";
+    cerr << (unsigned) SN->getAddressingMode() << "\n";
+    abort();
+    /*NOTREACHED*/
+  }
+
+  return SDOperand();
+}
+
+/// Generate the address of a constant pool entry.
+static SDOperand
+LowerConstantPool(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  Constant *C = CP->getConstVal();
+  SDOperand CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+  const TargetMachine &TM = DAG.getTarget();
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+
+  if (TM.getRelocationModel() == Reloc::Static) {
+    if (!ST->usingLargeMem()) {
+      // Just return the SDOperand with the constant pool address in it.
+      return CPI;
+    } else {
+      // Generate hi/lo address pair
+      SDOperand Hi = DAG.getNode(SPUISD::Hi, PtrVT, CPI, Zero);
+      SDOperand Lo = DAG.getNode(SPUISD::Lo, PtrVT, CPI, Zero);
+
+      return DAG.getNode(ISD::ADD, PtrVT, Lo, Hi);
+    }
+  }
+
+  assert(0 &&
+         "LowerConstantPool: Relocation model other than static not supported.");
+  return SDOperand();
+}
+
+static SDOperand
+LowerJumpTable(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+  const TargetMachine &TM = DAG.getTarget();
+
+  if (TM.getRelocationModel() == Reloc::Static) {
+    if (!ST->usingLargeMem()) {
+      // Just return the SDOperand with the jump table address in it.
+ return JTI; + } else { + // Generate hi/lo address pair + SDOperand Hi = DAG.getNode(SPUISD::Hi, PtrVT, JTI, Zero); + SDOperand Lo = DAG.getNode(SPUISD::Lo, PtrVT, JTI, Zero); + + return DAG.getNode(ISD::ADD, PtrVT, Lo, Hi); + } + } + + assert(0 && + "LowerJumpTable: Relocation model other than static not supported."); + return SDOperand(); +} + +static SDOperand +LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + MVT::ValueType PtrVT = Op.getValueType(); + GlobalAddressSDNode *GSDN = cast(Op); + GlobalValue *GV = GSDN->getGlobal(); + SDOperand GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); + SDOperand Zero = DAG.getConstant(0, PtrVT); + const TargetMachine &TM = DAG.getTarget(); + + if (TM.getRelocationModel() == Reloc::Static) { + if (!ST->usingLargeMem()) { + // Generate a local store address + return GA; + } else { + // Generate hi/lo address pair + SDOperand Hi = DAG.getNode(SPUISD::Hi, PtrVT, GA, Zero); + SDOperand Lo = DAG.getNode(SPUISD::Lo, PtrVT, GA, Zero); + + return DAG.getNode(ISD::ADD, PtrVT, Lo, Hi); + } + } else { + cerr << "LowerGlobalAddress: Relocation model other than static not " + << "supported.\n"; + abort(); + /*NOTREACHED*/ + } + + return SDOperand(); +} + +//! Custom lower i64 integer constants +/*! + This code inserts all of the necessary juggling that needs to occur to load + a 64-bit constant into a register. + */ +static SDOperand +LowerConstant(SDOperand Op, SelectionDAG &DAG) { + unsigned VT = Op.getValueType(); + ConstantSDNode *CN = cast(Op.Val); + + if (VT == MVT::i64) { + SDOperand T = DAG.getConstant(CN->getValue(), MVT::i64); + return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T)); + + } else { + cerr << "LowerConstant: unhandled constant type " + << MVT::getValueTypeString(VT) + << "\n"; + abort(); + /*NOTREACHED*/ + } + + return SDOperand(); +} + +//! Custom lower single precision floating point constants +/*! + "float" immediates can be lowered as if they were unsigned 32-bit integers. + The SPUISD::SFPConstant pseudo-instruction handles this in the instruction + target description. + */ +static SDOperand +LowerConstantFP(SDOperand Op, SelectionDAG &DAG) { + unsigned VT = Op.getValueType(); + ConstantFPSDNode *FP = cast(Op.Val); + + assert((FP != 0) && + "LowerConstantFP: Node is not ConstantFPSDNode"); + + const APFloat &apf = FP->getValueAPF(); + + if (VT == MVT::f32) { + return DAG.getNode(SPUISD::SFPConstant, VT, + DAG.getTargetConstantFP(apf.convertToFloat(), VT)); + } else if (VT == MVT::f64) { + uint64_t dbits = DoubleToBits(apf.convertToDouble()); + return DAG.getNode(ISD::BIT_CONVERT, VT, + LowerConstant(DAG.getConstant(dbits, MVT::i64), DAG)); + } + + return SDOperand(); +} + +static SDOperand +LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG, int &VarArgsFrameIndex) +{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + SSARegMap *RegMap = MF.getSSARegMap(); + SmallVector ArgValues; + SDOperand Root = Op.getOperand(0); + bool isVarArg = cast(Op.getOperand(2))->getValue() != 0; + + const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); + const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + + unsigned ArgOffset = SPUFrameInfo::minStackSize(); + unsigned ArgRegIdx = 0; + unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + // Add DAG nodes to load the arguments or copy them out of registers. 
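+  // (For non-vararg functions, the first NumArgRegs arguments arrive in the
+  // registers returned by SPURegisterInfo::getArgRegs(); overflow arguments
+  // are reloaded from fixed stack slots starting at
+  // SPUFrameInfo::minStackSize().)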
+ for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) { + SDOperand ArgVal; + bool needsLoad = false; + MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType(); + unsigned ObjSize = MVT::getSizeInBits(ObjectVT)/8; + + switch (ObjectVT) { + default: { + cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: " + << MVT::getValueTypeString(ObjectVT) + << "\n"; + abort(); + } + case MVT::i8: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R16CRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i8); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::i16: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R16CRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i16); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::i32: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R32CRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i32); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::i64: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R64CRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i64); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::f32: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R32FPRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::f32); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::f64: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::R64FPRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::f64); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + if (!isVarArg && ArgRegIdx < NumArgRegs) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT); + ++ArgRegIdx; + } else { + needsLoad = true; + } + break; + } + + // We need to load the argument to a virtual register if we determined above + // that we ran out of physical registers of the appropriate type + if (needsLoad) { + // If the argument is actually used, emit a load from the right stack + // slot. + if (!Op.Val->hasNUsesOfValue(0, ArgNo)) { + int FI = MFI->CreateFixedObject(ObjSize, ArgOffset); + SDOperand FIN = DAG.getFrameIndex(FI, PtrVT); + ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0); + } else { + // Don't emit a dead load. + ArgVal = DAG.getNode(ISD::UNDEF, ObjectVT); + } + + ArgOffset += StackSlotSize; + } + + ArgValues.push_back(ArgVal); + } + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. 
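+  // (The VASTART custom lowering can then simply materialize this frame
+  // index when expanding llvm.va_start.)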
+ if (isVarArg) { + VarArgsFrameIndex = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8, + ArgOffset); + SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); + // If this function is vararg, store any remaining integer argument regs to + // their spots on the stack so that they may be loaded by deferencing the + // result of va_next. + SmallVector MemOps; + for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) { + unsigned VReg = RegMap->createVirtualRegister(&SPU::GPRCRegClass); + MF.addLiveIn(ArgRegs[ArgRegIdx], VReg); + SDOperand Val = DAG.getCopyFromReg(Root, VReg, PtrVT); + SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT); + FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff); + } + if (!MemOps.empty()) + Root = DAG.getNode(ISD::TokenFactor, MVT::Other,&MemOps[0],MemOps.size()); + } + + ArgValues.push_back(Root); + + // Return the new list of results. + std::vector RetVT(Op.Val->value_begin(), + Op.Val->value_end()); + return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size()); +} + +/// isLSAAddress - Return the immediate to use if the specified +/// value is representable as a LSA address. +static SDNode *isLSAAddress(SDOperand Op, SelectionDAG &DAG) { + ConstantSDNode *C = dyn_cast(Op); + if (!C) return 0; + + int Addr = C->getValue(); + if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. + (Addr << 14 >> 14) != Addr) + return 0; // Top 14 bits have to be sext of immediate. + + return DAG.getConstant((int)C->getValue() >> 2, MVT::i32).Val; +} + +static +SDOperand +LowerCALL(SDOperand Op, SelectionDAG &DAG) { + SDOperand Chain = Op.getOperand(0); +#if 0 + bool isVarArg = cast(Op.getOperand(2))->getValue() != 0; + bool isTailCall = cast(Op.getOperand(3))->getValue() != 0; +#endif + SDOperand Callee = Op.getOperand(4); + unsigned NumOps = (Op.getNumOperands() - 5) / 2; + unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); + const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + + // Handy pointer type + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + // Accumulate how many bytes are to be pushed on the stack, including the + // linkage area, and parameter passing area. According to the SPU ABI, + // we minimally need space for [LR] and [SP] + unsigned NumStackBytes = SPUFrameInfo::minStackSize(); + + // Set up a copy of the stack pointer for use loading and storing any + // arguments that may not fit in the registers available for argument + // passing. + SDOperand StackPtr = DAG.getRegister(SPU::R1, MVT::i32); + + // Figure out which arguments are going to go in registers, and which in + // memory. + unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR] + unsigned ArgRegIdx = 0; + + // Keep track of registers passing arguments + std::vector > RegsToPass; + // And the arguments passed on the stack + SmallVector MemOpChains; + + for (unsigned i = 0; i != NumOps; ++i) { + SDOperand Arg = Op.getOperand(5+2*i); + + // PtrOff will be used to store the current argument to the stack if a + // register cannot be found for it. 
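+    // (ArgOffset only advances when an argument actually spills to memory,
+    // so PtrOff always points at the next free stack slot.)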
+ SDOperand PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff); + + switch (Arg.getValueType()) { + default: assert(0 && "Unexpected ValueType for argument!"); + case MVT::i32: + case MVT::i64: + case MVT::i128: + if (ArgRegIdx != NumArgRegs) { + RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + } else { + MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + ArgOffset += StackSlotSize; + } + break; + case MVT::f32: + case MVT::f64: + if (ArgRegIdx != NumArgRegs) { + RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + } else { + MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + ArgOffset += StackSlotSize; + } + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + if (ArgRegIdx != NumArgRegs) { + RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + } else { + MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + ArgOffset += StackSlotSize; + } + break; + } + } + + // Update number of stack bytes actually used, insert a call sequence start + NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize()); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumStackBytes, PtrVT)); + + if (!MemOpChains.empty()) { + // Adjust the stack pointer for the stack arguments. + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + } + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDOperand InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second, + InFlag); + InFlag = Chain.getValue(1); + } + + std::vector NodeTys; + NodeTys.push_back(MVT::Other); // Returns a chain + NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use. + + SmallVector Ops; + unsigned CallOpc = SPUISD::CALL; + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + GlobalValue *GV = G->getGlobal(); + unsigned CalleeVT = Callee.getValueType(); + + // Turn calls to targets that are defined (i.e., have bodies) into BRSL + // style calls, otherwise, external symbols are BRASL calls. + // NOTE: + // This may be an unsafe assumption for JIT and really large compilation + // units. + if (GV->isDeclaration()) { + Callee = DAG.getGlobalAddress(GV, CalleeVT); + } else { + Callee = DAG.getNode(SPUISD::PCRelAddr, CalleeVT, + DAG.getTargetGlobalAddress(GV, CalleeVT), + DAG.getConstant(0, PtrVT)); + } + } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) + Callee = DAG.getExternalSymbol(S->getSymbol(), Callee.getValueType()); + else if (SDNode *Dest = isLSAAddress(Callee, DAG)) + // If this is an absolute destination address that appears to be a legal + // local store address, use the munged value. + Callee = SDOperand(Dest, 0); + + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are known live + // into the call. 
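+  // (Without these operands, the scheduler and register allocator could
+  // otherwise consider the argument registers dead across the call node.)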
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + if (InFlag.Val) + Ops.push_back(InFlag); + Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + SDOperand ResultVals[3]; + unsigned NumResults = 0; + NodeTys.clear(); + + // If the call has results, copy the values out of the ret val registers. + switch (Op.Val->getValueType(0)) { + default: assert(0 && "Unexpected ret value!"); + case MVT::Other: break; + case MVT::i32: + if (Op.Val->getValueType(1) == MVT::i32) { + Chain = DAG.getCopyFromReg(Chain, SPU::R4, MVT::i32, InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, + Chain.getValue(2)).getValue(1); + ResultVals[1] = Chain.getValue(0); + NumResults = 2; + NodeTys.push_back(MVT::i32); + } else { + Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + NumResults = 1; + } + NodeTys.push_back(MVT::i32); + break; + case MVT::i64: + Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i64, InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + NumResults = 1; + NodeTys.push_back(MVT::i64); + break; + case MVT::f32: + case MVT::f64: + Chain = DAG.getCopyFromReg(Chain, SPU::R3, Op.Val->getValueType(0), + InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + NumResults = 1; + NodeTys.push_back(Op.Val->getValueType(0)); + break; + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + Chain = DAG.getCopyFromReg(Chain, SPU::R3, Op.Val->getValueType(0), + InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + NumResults = 1; + NodeTys.push_back(Op.Val->getValueType(0)); + break; + } + + Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain, + DAG.getConstant(NumStackBytes, PtrVT)); + NodeTys.push_back(MVT::Other); + + // If the function returns void, just return the chain. + if (NumResults == 0) + return Chain; + + // Otherwise, merge everything together with a MERGE_VALUES node. + ResultVals[NumResults++] = Chain; + SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys, + ResultVals, NumResults); + return Res.getValue(Op.ResNo); +} + +static SDOperand +LowerRET(SDOperand Op, SelectionDAG &DAG, TargetMachine &TM) { + SmallVector RVLocs; + unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); + bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); + CCState CCInfo(CC, isVarArg, TM, RVLocs); + CCInfo.AnalyzeReturn(Op.Val, RetCC_SPU); + + // If this is the first return lowered for this function, add the regs to the + // liveout set for the function. + if (DAG.getMachineFunction().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg()); + } + + SDOperand Chain = Op.getOperand(0); + SDOperand Flag; + + // Copy the result values into the output registers. 
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1), Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  if (Flag.Val)
+    return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain, Flag);
+  else
+    return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering:
+//===----------------------------------------------------------------------===//
+
+static ConstantSDNode *
+getVecImm(SDNode *N) {
+  SDOperand OpVal(0, 0);
+
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.Val == 0)
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return 0;
+  }
+
+  if (OpVal.Val != 0) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+      return CN;
+    }
+  }
+
+  return 0; // All UNDEF: use implicit def; not a Constant node
+}
+
+/// get_vec_u18imm - Test if this vector is a vector filled with the same value
+/// and the value fits into an unsigned 18-bit constant, and if so, return the
+/// constant
+SDOperand SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
+                              MVT::ValueType ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    uint64_t Value = CN->getValue();
+    if (Value <= 0x3ffff)
+      return DAG.getConstant(Value, ValueType);
+  }
+
+  return SDOperand();
+}
+
+/// get_vec_i16imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 16-bit constant, and if so, return the
+/// constant
+SDOperand SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
+                              MVT::ValueType ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    if (ValueType == MVT::i32) {
+      int Value = (int) CN->getValue();
+      int SExtValue = ((Value & 0xffff) << 16) >> 16;
+
+      if (Value == SExtValue)
+        return DAG.getConstant(Value, ValueType);
+    } else if (ValueType == MVT::i16) {
+      short Value = (short) CN->getValue();
+      int SExtValue = ((int) Value << 16) >> 16;
+
+      if (Value == (short) SExtValue)
+        return DAG.getConstant(Value, ValueType);
+    } else if (ValueType == MVT::i64) {
+      int64_t Value = CN->getValue();
+      int64_t SExtValue = ((Value & 0xffff) << (64 - 16)) >> (64 - 16);
+
+      if (Value == SExtValue)
+        return DAG.getConstant(Value, ValueType);
+    }
+  }
+
+  return SDOperand();
+}
+
+/// get_vec_i10imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 10-bit constant, and if so, return the
+/// constant
+SDOperand SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
+                              MVT::ValueType ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    int Value = (int) CN->getValue();
+    if ((ValueType == MVT::i32 && isS10Constant(Value))
+        || (ValueType == MVT::i16 && isS10Constant((short) Value)))
+      return DAG.getConstant(Value, ValueType);
+  }
+
+  return SDOperand();
+}
+
+/// get_vec_i8imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 8-bit constant, and if so, return the
+/// constant.
+///
+/// @note: The incoming vector is v16i8 because that's the only way we can load
+/// constant vectors. Thus, we test to see if the upper and lower bytes are the
+/// same value.
+SDOperand SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
+                             MVT::ValueType ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    int Value = (int) CN->getValue();
+    if (ValueType == MVT::i16
+        && Value <= 0xffff                 /* truncated from uint64_t */
+        && ((short) Value >> 8) == ((short) Value & 0xff))
+      return DAG.getConstant(Value & 0xff, ValueType);
+    else if (ValueType == MVT::i8
+             && (Value & 0xff) == Value)
+      return DAG.getConstant(Value, ValueType);
+  }
+
+  return SDOperand();
+}
+
+/// get_ILHUvec_imm - Test if this vector is a vector filled with the same
+/// value and the value is suitable for an ILHU immediate (only the upper 16
+/// bits of each element are significant), and if so, return the constant
+/// shifted down into the lower 16 bits
+SDOperand SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
+                               MVT::ValueType ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    uint64_t Value = CN->getValue();
+    if ((ValueType == MVT::i32
+         && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
+        || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
+      return DAG.getConstant(Value >> 16, ValueType);
+  }
+
+  return SDOperand();
+}
+
+/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
+SDOperand SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    return DAG.getConstant((unsigned) CN->getValue(), MVT::i32);
+  }
+
+  return SDOperand();
+}
+
+/// get_v2i64_imm - Catch-all for general 64-bit constant vectors
+SDOperand SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    return DAG.getConstant(CN->getValue(), MVT::i64);
+  }
+
+  return SDOperand();
+}
+
+// If this is a vector of constants or undefs, get the bits. A bit in
+// UndefBits is set if the corresponding element of the vector is an
+// ISD::UNDEF value. For undefs, the corresponding VectorBits values are
+// zero. Return true if this is not an array of constants, false if it is.
+//
+static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
+                                       uint64_t UndefBits[2]) {
+  // Start with zero'd results.
+  VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0;
+
+  unsigned EltBitSize = MVT::getSizeInBits(BV->getOperand(0).getValueType());
+  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+    SDOperand OpVal = BV->getOperand(i);
+
+    unsigned PartNo = i >= e/2;               // In the upper 64 bits?
+    unsigned SlotNo = e/2 - (i & (e/2-1))-1;  // Which subpiece of the uint64_t.
+
+    uint64_t EltBits = 0;
+    if (OpVal.getOpcode() == ISD::UNDEF) {
+      uint64_t EltUndefBits = ~0ULL >> (64-EltBitSize);
+      UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize);
+      continue;
+    } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+      EltBits = CN->getValue() & (~0ULL >> (64-EltBitSize));
+    } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+      const APFloat &apf = CN->getValueAPF();
+      EltBits = (CN->getValueType(0) == MVT::f32
+                 ? FloatToBits(apf.convertToFloat())
+                 : DoubleToBits(apf.convertToDouble()));
+    } else {
+      // Nonconstant element.
+      return true;
+    }
+
+    VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize);
+  }
+
+  //printf("%llx %llx %llx %llx\n",
+  //       VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]);
+  return false;
+}
+
+/// If this is a splat (repetition) of a value across the whole vector, return
+/// the smallest size that splats it. For example, "0x01010101010101..." is a
+/// splat of 0x01, 0x0101, and 0x01010101. We return SplatBits = 0x01 and
+/// SplatSize = 1 byte.
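+/// The implementation repeatedly folds the 128-bit value in half (64 -> 32
+/// -> 16 bits), stopping at the smallest width at which both halves still
+/// agree once undef bits are masked out.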
+static bool isConstantSplat(const uint64_t Bits128[2], + const uint64_t Undef128[2], + int MinSplatBits, + uint64_t &SplatBits, uint64_t &SplatUndef, + int &SplatSize) { + // Don't let undefs prevent splats from matching. See if the top 64-bits are + // the same as the lower 64-bits, ignoring undefs. + uint64_t Bits64 = Bits128[0] | Bits128[1]; + uint64_t Undef64 = Undef128[0] & Undef128[1]; + uint32_t Bits32 = uint32_t(Bits64) | uint32_t(Bits64 >> 32); + uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32); + uint16_t Bits16 = uint16_t(Bits32) | uint16_t(Bits32 >> 16); + uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16); + + if ((Bits128[0] & ~Undef128[1]) == (Bits128[1] & ~Undef128[0])) { + if (MinSplatBits < 64) { + + // Check that the top 32-bits are the same as the lower 32-bits, ignoring + // undefs. + if ((Bits64 & (~Undef64 >> 32)) == ((Bits64 >> 32) & ~Undef64)) { + if (MinSplatBits < 32) { + + // If the top 16-bits are different than the lower 16-bits, ignoring + // undefs, we have an i32 splat. + if ((Bits32 & (~Undef32 >> 16)) == ((Bits32 >> 16) & ~Undef32)) { + if (MinSplatBits < 16) { + // If the top 8-bits are different than the lower 8-bits, ignoring + // undefs, we have an i16 splat. + if ((Bits16 & (uint16_t(~Undef16) >> 8)) == ((Bits16 >> 8) & ~Undef16)) { + // Otherwise, we have an 8-bit splat. + SplatBits = uint8_t(Bits16) | uint8_t(Bits16 >> 8); + SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8); + SplatSize = 1; + return true; + } + } else { + SplatBits = Bits16; + SplatUndef = Undef16; + SplatSize = 2; + return true; + } + } + } else { + SplatBits = Bits32; + SplatUndef = Undef32; + SplatSize = 4; + return true; + } + } + } else { + SplatBits = Bits128[0]; + SplatUndef = Undef128[0]; + SplatSize = 8; + return true; + } + } + + return false; // Can't be a splat if two pieces don't match. +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. If we CAN select this case, and if it +// selects to a single instruction, return Op. Otherwise, if we can codegen +// this case more efficiently than a constant pool load, lower it to the +// sequence of ops that should be used. +static SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + // If this is a vector of constants or undefs, get the bits. A bit in + // UndefBits is set if the corresponding element of the vector is an + // ISD::UNDEF value. For undefs, the corresponding VectorBits values are + // zero. + uint64_t VectorBits[2]; + uint64_t UndefBits[2]; + uint64_t SplatBits, SplatUndef; + int SplatSize; + if (GetConstantBuildVectorBits(Op.Val, VectorBits, UndefBits) + || !isConstantSplat(VectorBits, UndefBits, + MVT::getSizeInBits(MVT::getVectorElementType(VT)), + SplatBits, SplatUndef, SplatSize)) + return SDOperand(); // Not a constant vector, not a splat. + + switch (VT) { + default: + case MVT::v4f32: { + uint32_t Value32 = SplatBits; + assert(SplatSize == 4 + && "LowerBUILD_VECTOR: Unexpected floating point vector element."); + // NOTE: pretend the constant is an integer. LLVM won't load FP constants + SDOperand T = DAG.getConstant(Value32, MVT::i32); + return DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, T, T, T, T)); + break; + } + case MVT::v2f64: { + uint64_t f64val = SplatBits; + assert(SplatSize == 8 + && "LowerBUILD_VECTOR: 64-bit float vector element: unexpected size."); + // NOTE: pretend the constant is an integer. 
LLVM won't load FP constants + SDOperand T = DAG.getConstant(f64val, MVT::i64); + return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T)); + break; + } + case MVT::v16i8: { + // 8-bit constants have to be expanded to 16-bits + unsigned short Value16 = SplatBits | (SplatBits << 8); + SDOperand Ops[8]; + for (int i = 0; i < 8; ++i) + Ops[i] = DAG.getConstant(Value16, MVT::i16); + return DAG.getNode(ISD::BIT_CONVERT, VT, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v8i16, Ops, 8)); + } + case MVT::v8i16: { + unsigned short Value16; + if (SplatSize == 2) + Value16 = (unsigned short) (SplatBits & 0xffff); + else + Value16 = (unsigned short) (SplatBits | (SplatBits << 8)); + SDOperand T = DAG.getConstant(Value16, MVT::getVectorElementType(VT)); + SDOperand Ops[8]; + for (int i = 0; i < 8; ++i) Ops[i] = T; + return DAG.getNode(ISD::BUILD_VECTOR, VT, Ops, 8); + } + case MVT::v4i32: { + unsigned int Value = SplatBits; + SDOperand T = DAG.getConstant(Value, MVT::getVectorElementType(VT)); + return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T, T, T); + } + case MVT::v2i64: { + uint64_t val = SplatBits; + uint32_t upper = uint32_t(val >> 32); + uint32_t lower = uint32_t(val); + + if (val != 0) { + SDOperand LO32; + SDOperand HI32; + SmallVector ShufBytes; + SDOperand Result; + bool upper_special, lower_special; + + // NOTE: This code creates common-case shuffle masks that can be easily + // detected as common expressions. It is not attempting to create highly + // specialized masks to replace any and all 0's, 0xff's and 0x80's. + + // Detect if the upper or lower half is a special shuffle mask pattern: + upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000); + lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000); + + // Create lower vector if not a special pattern + if (!lower_special) { + SDOperand LO32C = DAG.getConstant(lower, MVT::i32); + LO32 = DAG.getNode(ISD::BIT_CONVERT, VT, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + LO32C, LO32C, LO32C, LO32C)); + } + + // Create upper vector if not a special pattern + if (!upper_special) { + SDOperand HI32C = DAG.getConstant(upper, MVT::i32); + HI32 = DAG.getNode(ISD::BIT_CONVERT, VT, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + HI32C, HI32C, HI32C, HI32C)); + } + + // If either upper or lower are special, then the two input operands are + // the same (basically, one of them is a "don't care") + if (lower_special) + LO32 = HI32; + if (upper_special) + HI32 = LO32; + if (lower_special && upper_special) { + // Unhappy situation... both upper and lower are special, so punt with + // a target constant: + SDOperand Zero = DAG.getConstant(0, MVT::i32); + HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero, + Zero, Zero); + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + SDOperand V; + bool process_upper, process_lower; + uint64_t val; + + process_upper = (upper_special && (i & 1) == 0); + process_lower = (lower_special && (i & 1) == 1); + + if (process_upper || process_lower) { + if ((process_upper && upper == 0) + || (process_lower && lower == 0)) + val = 0x80; + else if ((process_upper && upper == 0xffffffff) + || (process_lower && lower == 0xffffffff)) + val = 0xc0; + else if ((process_upper && upper == 0x80000000) + || (process_lower && lower == 0x80000000)) + val = (j == 0 ? 
0xe0 : 0x80); + } else + val = i * 4 + j + ((i & 1) * 16); + + ShufBytes.push_back(DAG.getConstant(val, MVT::i8)); + } + } + + return DAG.getNode(SPUISD::SHUFB, VT, HI32, LO32, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, + &ShufBytes[0], ShufBytes.size())); + } else { + // For zero, this can be lowered efficiently via v4i32 BUILD_VECTOR + SDOperand Zero = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ISD::BIT_CONVERT, VT, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + Zero, Zero, Zero, Zero)); + } + } + } + + return SDOperand(); +} + +/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on +/// which the Cell can operate. The code inspects V3 to ascertain whether the +/// permutation vector, V3, is monotonically increasing with one "exception" +/// element, e.g., (0, 1, _, 3). If this is the case, then generate a +/// INSERT_MASK synthetic instruction. Otherwise, spill V3 to the constant pool. +/// In either case, the net result is going to eventually invoke SHUFB to +/// permute/shuffle the bytes from V1 and V2. +/// \note +/// INSERT_MASK is eventually selected as one of the C*D instructions, generate +/// control word for byte/halfword/word insertion. This takes care of a single +/// element move from V2 into V1. +/// \note +/// SPUISD::SHUFB is eventually selected as Cell's shufb instructions. +static SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { + SDOperand V1 = Op.getOperand(0); + SDOperand V2 = Op.getOperand(1); + SDOperand PermMask = Op.getOperand(2); + + if (V2.getOpcode() == ISD::UNDEF) V2 = V1; + + // If we have a single element being moved from V1 to V2, this can be handled + // using the C*[DX] compute mask instructions, but the vector elements have + // to be monotonically increasing with one exception element. + MVT::ValueType EltVT = MVT::getVectorElementType(V1.getValueType()); + unsigned EltsFromV2 = 0; + unsigned V2Elt = 0; + unsigned V2EltIdx0 = 0; + unsigned CurrElt = 0; + bool monotonic = true; + if (EltVT == MVT::i8) + V2EltIdx0 = 16; + else if (EltVT == MVT::i16) + V2EltIdx0 = 8; + else if (EltVT == MVT::i32) + V2EltIdx0 = 4; + else + assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE"); + + for (unsigned i = 0, e = PermMask.getNumOperands(); + EltsFromV2 <= 1 && monotonic && i != e; + ++i) { + unsigned SrcElt; + if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF) + SrcElt = 0; + else + SrcElt = cast(PermMask.getOperand(i))->getValue(); + + if (SrcElt >= V2EltIdx0) { + ++EltsFromV2; + V2Elt = (V2EltIdx0 - SrcElt) << 2; + } else if (CurrElt != SrcElt) { + monotonic = false; + } + + ++CurrElt; + } + + if (EltsFromV2 == 1 && monotonic) { + // Compute mask and shuffle + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + unsigned VReg = RegMap->createVirtualRegister(&SPU::R32CRegClass); + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + // Initialize temporary register to 0 + SDOperand InitTempReg = + DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT)); + // Copy register's contents as index in INSERT_MASK: + SDOperand ShufMaskOp = + DAG.getNode(SPUISD::INSERT_MASK, V1.getValueType(), + DAG.getTargetConstant(V2Elt, MVT::i32), + DAG.getCopyFromReg(InitTempReg, VReg, PtrVT)); + // Use shuffle mask in SHUFB synthetic instruction: + return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V2, V1, ShufMaskOp); + } else { + // Convert the SHUFFLE_VECTOR mask's input element units to the actual bytes. 
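+    // (For example, a v4i32 mask entry of 5 selects element 1 of V2, which
+    // becomes bytes 20..23 of the concatenated V1||V2 byte vector.)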
+ unsigned BytesPerElement = MVT::getSizeInBits(EltVT)/8; + + SmallVector ResultMask; + for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) { + unsigned SrcElt; + if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF) + SrcElt = 0; + else + SrcElt = cast(PermMask.getOperand(i))->getValue(); + + for (unsigned j = 0; j != BytesPerElement; ++j) { + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i8)); + } + } + + SDOperand VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, + &ResultMask[0], ResultMask.size()); + return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V1, V2, VPermMask); + } +} + +static SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) { + SDOperand Op0 = Op.getOperand(0); // Op0 = the scalar + + if (Op0.Val->getOpcode() == ISD::Constant) { + // For a constant, build the appropriate constant vector, which will + // eventually simplify to a vector register load. + + ConstantSDNode *CN = cast(Op0.Val); + SmallVector ConstVecValues; + MVT::ValueType VT; + size_t n_copies; + + // Create a constant vector: + switch (Op.getValueType()) { + default: assert(0 && "Unexpected constant value type in " + "LowerSCALAR_TO_VECTOR"); + case MVT::v16i8: n_copies = 16; VT = MVT::i8; break; + case MVT::v8i16: n_copies = 8; VT = MVT::i16; break; + case MVT::v4i32: n_copies = 4; VT = MVT::i32; break; + case MVT::v4f32: n_copies = 4; VT = MVT::f32; break; + case MVT::v2i64: n_copies = 2; VT = MVT::i64; break; + case MVT::v2f64: n_copies = 2; VT = MVT::f64; break; + } + + SDOperand CValue = DAG.getConstant(CN->getValue(), VT); + for (size_t j = 0; j < n_copies; ++j) + ConstVecValues.push_back(CValue); + + return DAG.getNode(ISD::BUILD_VECTOR, Op.getValueType(), + &ConstVecValues[0], ConstVecValues.size()); + } else { + // Otherwise, copy the value from one register to another: + switch (Op0.getValueType()) { + default: assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR"); + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + return DAG.getNode(SPUISD::PROMOTE_SCALAR, Op.getValueType(), Op0, Op0); + } + } + + return SDOperand(); +} + +static SDOperand LowerVectorMUL(SDOperand Op, SelectionDAG &DAG) { + switch (Op.getValueType()) { + case MVT::v4i32: { + SDOperand rA = Op.getOperand(0); + SDOperand rB = Op.getOperand(1); + SDOperand HiProd1 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rA, rB); + SDOperand HiProd2 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rB, rA); + SDOperand LoProd = DAG.getNode(SPUISD::MPYU, MVT::v4i32, rA, rB); + SDOperand Residual1 = DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd1); + + return DAG.getNode(ISD::ADD, MVT::v4i32, Residual1, HiProd2); + break; + } + + // Multiply two v8i16 vectors (pipeline friendly version): + // a) multiply lower halves, mask off upper 16-bit of 32-bit product + // b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes) + // c) Use SELB to select upper and lower halves from the intermediate results + // + // NOTE: We really want to move the FSMBI to earlier to actually get the + // dual-issue. 
This code does manage to do this, even if it's a little on + // the wacky side + case MVT::v8i16: { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + SDOperand Chain = Op.getOperand(0); + SDOperand rA = Op.getOperand(0); + SDOperand rB = Op.getOperand(1); + unsigned FSMBIreg = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + unsigned HiProdReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + + SDOperand FSMBOp = + DAG.getCopyToReg(Chain, FSMBIreg, + DAG.getNode(SPUISD::FSMBI, MVT::v8i16, + DAG.getConstant(0xcccc, MVT::i32))); + + SDOperand HHProd = + DAG.getCopyToReg(FSMBOp, HiProdReg, + DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB)); + + SDOperand HHProd_v4i32 = + DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, + DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32)); + + return DAG.getNode(SPUISD::SELB, MVT::v8i16, + DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB), + DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), + DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, + HHProd_v4i32, + DAG.getConstant(16, MVT::i16))), + DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32)); + } + + // This M00sE is N@stI! (apologies to Monty Python) + // + // SPU doesn't know how to do any 8-bit multiplication, so the solution + // is to break it all apart, sign extend, and reassemble the various + // intermediate products. + case MVT::v16i8: { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + SDOperand Chain = Op.getOperand(0); + SDOperand rA = Op.getOperand(0); + SDOperand rB = Op.getOperand(1); + SDOperand c8 = DAG.getConstant(8, MVT::i8); + SDOperand c16 = DAG.getConstant(16, MVT::i8); + + unsigned FSMBreg_2222 = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + unsigned LoProd_reg = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + unsigned HiProd_reg = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + + SDOperand LLProd = + DAG.getNode(SPUISD::MPY, MVT::v8i16, + DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA), + DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB)); + + SDOperand rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8); + + SDOperand rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8); + + SDOperand LHProd = + DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, + DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8); + + SDOperand FSMBdef_2222 = + DAG.getCopyToReg(Chain, FSMBreg_2222, + DAG.getNode(SPUISD::FSMBI, MVT::v8i16, + DAG.getConstant(0x2222, MVT::i32))); + + SDOperand FSMBuse_2222 = + DAG.getCopyFromReg(FSMBdef_2222, FSMBreg_2222, MVT::v4i32); + + SDOperand LoProd_1 = + DAG.getCopyToReg(Chain, LoProd_reg, + DAG.getNode(SPUISD::SELB, MVT::v8i16, LLProd, LHProd, + FSMBuse_2222)); + + SDOperand LoProdMask = DAG.getConstant(0xffff, MVT::i32); + + SDOperand LoProd = + DAG.getNode(ISD::AND, MVT::v4i32, + DAG.getCopyFromReg(LoProd_1, LoProd_reg, MVT::v4i32), + DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + LoProdMask, LoProdMask, + LoProdMask, LoProdMask)); + + SDOperand rAH = + DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, + DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16); + + SDOperand rBH = + DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, + DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16); + + SDOperand HLProd = + DAG.getNode(SPUISD::MPY, MVT::v8i16, + DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH), + DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH)); + + SDOperand HHProd_1 = + DAG.getNode(SPUISD::MPY, MVT::v8i16, + DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, + DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, rAH, c8)), + 
DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, + DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, rBH, c8))); + + SDOperand HHProd = + DAG.getCopyToReg(Chain, HiProd_reg, + DAG.getNode(SPUISD::SELB, MVT::v8i16, + HLProd, + DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8), + FSMBuse_2222)); + + SDOperand HiProd = + DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, + DAG.getCopyFromReg(HHProd, HiProd_reg, MVT::v4i32), c16); + + return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, + DAG.getNode(ISD::OR, MVT::v4i32, + LoProd, HiProd)); + } + + default: + cerr << "CellSPU: Unknown vector multiplication, got " + << MVT::getValueTypeString(Op.getValueType()) + << "\n"; + abort(); + /*NOTREACHED*/ + } + + return SDOperand(); +} + +static SDOperand LowerFDIVf32(SDOperand Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + + SDOperand A = Op.getOperand(0); + SDOperand B = Op.getOperand(1); + unsigned VT = Op.getValueType(); + + unsigned VRegBR, VRegC; + + if (VT == MVT::f32) { + VRegBR = RegMap->createVirtualRegister(&SPU::R32FPRegClass); + VRegC = RegMap->createVirtualRegister(&SPU::R32FPRegClass); + } else { + VRegBR = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + VRegC = RegMap->createVirtualRegister(&SPU::VECREGRegClass); + } + // TODO: make sure we're feeding FPInterp the right arguments + // Right now: fi B, frest(B) + + // Computes BRcpl = + // (Floating Interpolate (FP Reciprocal Estimate B)) + SDOperand BRcpl = + DAG.getCopyToReg(DAG.getEntryNode(), VRegBR, + DAG.getNode(SPUISD::FPInterp, VT, B, + DAG.getNode(SPUISD::FPRecipEst, VT, B))); + + // Computes A * BRcpl and stores in a temporary register + SDOperand AxBRcpl = + DAG.getCopyToReg(BRcpl, VRegC, + DAG.getNode(ISD::FMUL, VT, A, + DAG.getCopyFromReg(BRcpl, VRegBR, VT))); + // What's the Chain variable do? It's magic! + // TODO: set Chain = Op(0).getEntryNode() + + return DAG.getNode(ISD::FADD, VT, + DAG.getCopyFromReg(AxBRcpl, VRegC, VT), + DAG.getNode(ISD::FMUL, VT, + DAG.getCopyFromReg(AxBRcpl, VRegBR, VT), + DAG.getNode(ISD::FSUB, VT, A, + DAG.getNode(ISD::FMUL, VT, B, + DAG.getCopyFromReg(AxBRcpl, VRegC, VT))))); +} + +// Expands double-precision FDIV +// Expects two doubles as inputs X and Y, does a floating point +// reciprocal estimate, and three iterations of Newton-Raphson +// to increase accuracy. 
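+//
+// A sketch of the intended refinement (illustrative only; the helper below
+// is still commented out): with R0 = FPRecipEst(Y) as the initial estimate
+// (accurate to roughly 12 bits), each Newton-Raphson step computes
+//
+//   R_{n+1} = R_n * (2.0 - Y * R_n)
+//
+// roughly doubling the number of correct bits, so three steps suffice for
+// the 53-bit double-precision mantissa. The quotient is then X * R_3.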
+//static SDOperand LowerFDIVf64(SDOperand Op, SelectionDAG &DAG) {
+//  MachineFunction &MF = DAG.getMachineFunction();
+//  SSARegMap *RegMap = MF.getSSARegMap();
+//
+//  SDOperand X = Op.getOperand(0);
+//  SDOperand Y = Op.getOperand(1);
+//}
+
+static SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  unsigned VT = Op.getValueType();
+  SDOperand N = Op.getOperand(0);
+  SDOperand Elt = Op.getOperand(1);
+  SDOperand ShufMask[16];
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
+
+  assert(C != 0 && "LowerEXTRACT_VECTOR_ELT expecting constant SDNode");
+
+  int EltNo = (int) C->getValue();
+
+  // sanity checks:
+  if (VT == MVT::i8 && EltNo >= 16)
+    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
+  else if (VT == MVT::i16 && EltNo >= 8)
+    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
+  else if (VT == MVT::i32 && EltNo >= 4)
+    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 3");
+  else if (VT == MVT::i64 && EltNo >= 2)
+    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 1");
+
+  if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
+    // i32 and i64: Element 0 is the preferred slot
+    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, N);
+  }
+
+  // Need to generate shuffle mask and extract:
+  int prefslot_begin, prefslot_end;
+  int elt_byte = EltNo * MVT::getSizeInBits(VT) / 8;
+
+  switch (VT) {
+  case MVT::i8: {
+    prefslot_begin = prefslot_end = 3;
+    break;
+  }
+  case MVT::i16: {
+    prefslot_begin = 2; prefslot_end = 3;
+    break;
+  }
+  case MVT::i32: {
+    prefslot_begin = 0; prefslot_end = 3;
+    break;
+  }
+  case MVT::i64: {
+    prefslot_begin = 0; prefslot_end = 7;
+    break;
+  }
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    // zero fill upper part of preferred slot, don't care about the
+    // other slots:
+    unsigned int mask_val;
+
+    if (i <= prefslot_end) {
+      mask_val =
+        ((i < prefslot_begin)
+         ?
0x80 + : elt_byte + (i - prefslot_begin)); + + ShufMask[i] = DAG.getConstant(mask_val, MVT::i16); + } else + ShufMask[i] = ShufMask[i % (prefslot_end + 1)]; + } + + SDOperand ShufMaskVec = + DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, + &ShufMask[0], + sizeof(ShufMask) / sizeof(ShufMask[0])); + + return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, + DAG.getNode(SPUISD::SHUFB, N.getValueType(), + N, N, ShufMaskVec)); + +} + +static SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { + SDOperand VecOp = Op.getOperand(0); + SDOperand ValOp = Op.getOperand(1); + SDOperand IdxOp = Op.getOperand(2); + MVT::ValueType VT = Op.getValueType(); + + ConstantSDNode *CN = cast(IdxOp); + assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + // Use $2 because it's always 16-byte aligned and it's available: + SDOperand PtrBase = DAG.getRegister(SPU::R2, PtrVT); + + SDOperand result = + DAG.getNode(SPUISD::SHUFB, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp), + VecOp, + DAG.getNode(SPUISD::INSERT_MASK, VT, + DAG.getNode(ISD::ADD, PtrVT, + PtrBase, + DAG.getConstant(CN->getValue(), + PtrVT)))); + + return result; +} + +static SDOperand LowerI8Math(SDOperand Op, SelectionDAG &DAG, unsigned Opc) { + SDOperand N0 = Op.getOperand(0); // Everything has at least one operand + + assert(Op.getValueType() == MVT::i8); + switch (Opc) { + default: + assert(0 && "Unhandled i8 math operator"); + /*NOTREACHED*/ + break; + case ISD::SUB: { + // 8-bit subtraction: Promote the arguments up to 16-bits and truncate + // the result: + SDOperand N1 = Op.getOperand(1); + N0 = (N0.getOpcode() != ISD::Constant + ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) + : DAG.getConstant(cast(N0)->getValue(), MVT::i16)); + N1 = (N1.getOpcode() != ISD::Constant + ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1) + : DAG.getConstant(cast(N1)->getValue(), MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, MVT::i8, + DAG.getNode(Opc, MVT::i16, N0, N1)); + } + case ISD::ROTR: + case ISD::ROTL: { + SDOperand N1 = Op.getOperand(1); + unsigned N1Opc; + N0 = (N0.getOpcode() != ISD::Constant + ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) + : DAG.getConstant(cast(N0)->getValue(), MVT::i16)); + N1Opc = (N1.getValueType() < MVT::i16 ? ISD::ZERO_EXTEND : ISD::TRUNCATE); + N1 = (N1.getOpcode() != ISD::Constant + ? DAG.getNode(N1Opc, MVT::i16, N1) + : DAG.getConstant(cast(N1)->getValue(), MVT::i16)); + SDOperand ExpandArg = + DAG.getNode(ISD::OR, MVT::i16, N0, + DAG.getNode(ISD::SHL, MVT::i16, + N0, DAG.getConstant(8, MVT::i16))); + return DAG.getNode(ISD::TRUNCATE, MVT::i8, + DAG.getNode(Opc, MVT::i16, ExpandArg, N1)); + } + case ISD::SRL: + case ISD::SHL: { + SDOperand N1 = Op.getOperand(1); + unsigned N1Opc; + N0 = (N0.getOpcode() != ISD::Constant + ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) + : DAG.getConstant(cast(N0)->getValue(), MVT::i16)); + N1Opc = (N1.getValueType() < MVT::i16 ? ISD::ZERO_EXTEND : ISD::TRUNCATE); + N1 = (N1.getOpcode() != ISD::Constant + ? DAG.getNode(N1Opc, MVT::i16, N1) + : DAG.getConstant(cast(N1)->getValue(), MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, MVT::i8, + DAG.getNode(Opc, MVT::i16, N0, N1)); + } + case ISD::SRA: { + SDOperand N1 = Op.getOperand(1); + unsigned N1Opc; + N0 = (N0.getOpcode() != ISD::Constant + ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) + : DAG.getConstant(cast(N0)->getValue(), MVT::i16)); + N1Opc = (N1.getValueType() < MVT::i16 ? 
ISD::SIGN_EXTEND : ISD::TRUNCATE);
+    N1 = (N1.getOpcode() != ISD::Constant
+          ? DAG.getNode(N1Opc, MVT::i16, N1)
+          : DAG.getConstant(cast<ConstantSDNode>(N1)->getValue(), MVT::i16));
+    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
+                       DAG.getNode(Opc, MVT::i16, N0, N1));
+  }
+  case ISD::MUL: {
+    SDOperand N1 = Op.getOperand(1);
+    unsigned N1Opc;
+    N0 = (N0.getOpcode() != ISD::Constant
+          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
+          : DAG.getConstant(cast<ConstantSDNode>(N0)->getValue(), MVT::i16));
+    N1Opc = (N1.getValueType() < MVT::i16 ? ISD::SIGN_EXTEND : ISD::TRUNCATE);
+    N1 = (N1.getOpcode() != ISD::Constant
+          ? DAG.getNode(N1Opc, MVT::i16, N1)
+          : DAG.getConstant(cast<ConstantSDNode>(N1)->getValue(), MVT::i16));
+    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
+                       DAG.getNode(Opc, MVT::i16, N0, N1));
+  }
+  }
+
+  return SDOperand();
+}
+
+//! Lower byte immediate operations for v16i8 vectors:
+static SDOperand
+LowerByteImmed(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand ConstVec;
+  SDOperand Arg;
+  MVT::ValueType VT = Op.getValueType();
+
+  ConstVec = Op.getOperand(0);
+  Arg = Op.getOperand(1);
+  if (ConstVec.Val->getOpcode() != ISD::BUILD_VECTOR) {
+    if (ConstVec.Val->getOpcode() == ISD::BIT_CONVERT) {
+      ConstVec = ConstVec.getOperand(0);
+    } else {
+      ConstVec = Op.getOperand(1);
+      Arg = Op.getOperand(0);
+      if (ConstVec.Val->getOpcode() == ISD::BIT_CONVERT) {
+        ConstVec = ConstVec.getOperand(0);
+      }
+    }
+  }
+
+  if (ConstVec.Val->getOpcode() == ISD::BUILD_VECTOR) {
+    uint64_t VectorBits[2];
+    uint64_t UndefBits[2];
+    uint64_t SplatBits, SplatUndef;
+    int SplatSize;
+
+    if (!GetConstantBuildVectorBits(ConstVec.Val, VectorBits, UndefBits)
+        && isConstantSplat(VectorBits, UndefBits,
+                           MVT::getSizeInBits(MVT::getVectorElementType(VT)),
+                           SplatBits, SplatUndef, SplatSize)) {
+      SDOperand tcVec[16];
+      SDOperand tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
+      const size_t tcVecSize = sizeof(tcVec) / sizeof(tcVec[0]);
+
+      // Turn the BUILD_VECTOR into a set of target constants:
+      for (size_t i = 0; i < tcVecSize; ++i)
+        tcVec[i] = tc;
+
+      return DAG.getNode(Op.Val->getOpcode(), VT, Arg,
+                         DAG.getNode(ISD::BUILD_VECTOR, VT, tcVec, tcVecSize));
+    }
+  }
+
+  return SDOperand();
+}
+
+//! Lower i32 multiplication
+static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG, unsigned VT,
+                          unsigned Opc) {
+  switch (VT) {
+  default:
+    cerr << "CellSPU: Unknown LowerMUL value type, got "
+         << MVT::getValueTypeString(Op.getValueType())
+         << "\n";
+    abort();
+    /*NOTREACHED*/
+
+  case MVT::i32: {
+    SDOperand rA = Op.getOperand(0);
+    SDOperand rB = Op.getOperand(1);
+
+    return DAG.getNode(ISD::ADD, MVT::i32,
+                       DAG.getNode(ISD::ADD, MVT::i32,
+                                   DAG.getNode(SPUISD::MPYH, MVT::i32, rA, rB),
+                                   DAG.getNode(SPUISD::MPYH, MVT::i32, rB, rA)),
+                       DAG.getNode(SPUISD::MPYU, MVT::i32, rA, rB));
+  }
+  }
+
+  return SDOperand();
+}
+
+//! Custom lowering for CTPOP (count population)
+/*!
+  Custom lowering code that counts the number of ones in the input
+  operand. SPU has such an instruction, but it counts the number of
+  ones per byte, which then have to be accumulated.
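+
+  As an illustrative sketch of the accumulation (mirroring the i32 case
+  below), the per-byte counts from CNTB are folded down with shifts and
+  adds:
+
+    t   = CNTB(splat(x))    // per-byte population counts
+    t  += t >> 16           // fold upper halfword onto lower
+    t  += t >> 8            // fold remaining byte
+    pop = t & 0xff          // low byte holds the full count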
+*/ +static SDOperand LowerCTPOP(SDOperand Op, SelectionDAG &DAG) { + unsigned VT = Op.getValueType(); + unsigned vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT))); + + switch (VT) { + case MVT::i8: { + SDOperand N = Op.getOperand(0); + SDOperand Elt0 = DAG.getConstant(0, MVT::i32); + + SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i8, CNTB, Elt0); + } + + case MVT::i16: { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + + unsigned CNTB_reg = RegMap->createVirtualRegister(&SPU::R16CRegClass); + + SDOperand N = Op.getOperand(0); + SDOperand Elt0 = DAG.getConstant(0, MVT::i16); + SDOperand Mask0 = DAG.getConstant(0x0f, MVT::i16); + SDOperand Shift1 = DAG.getConstant(8, MVT::i16); + + SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + + // CNTB_result becomes the chain to which all of the virtual registers + // CNTB_reg, SUM1_reg become associated: + SDOperand CNTB_result = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, CNTB, Elt0); + + SDOperand CNTB_rescopy = + DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result); + + SDOperand Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i16); + + return DAG.getNode(ISD::AND, MVT::i16, + DAG.getNode(ISD::ADD, MVT::i16, + DAG.getNode(ISD::SRL, MVT::i16, + Tmp1, Shift1), + Tmp1), + Mask0); + } + + case MVT::i32: { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + + unsigned CNTB_reg = RegMap->createVirtualRegister(&SPU::R32CRegClass); + unsigned SUM1_reg = RegMap->createVirtualRegister(&SPU::R32CRegClass); + + SDOperand N = Op.getOperand(0); + SDOperand Elt0 = DAG.getConstant(0, MVT::i32); + SDOperand Mask0 = DAG.getConstant(0xff, MVT::i32); + SDOperand Shift1 = DAG.getConstant(16, MVT::i32); + SDOperand Shift2 = DAG.getConstant(8, MVT::i32); + + SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + + // CNTB_result becomes the chain to which all of the virtual registers + // CNTB_reg, SUM1_reg become associated: + SDOperand CNTB_result = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, CNTB, Elt0); + + SDOperand CNTB_rescopy = + DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result); + + SDOperand Comp1 = + DAG.getNode(ISD::SRL, MVT::i32, + DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32), Shift1); + + SDOperand Sum1 = + DAG.getNode(ISD::ADD, MVT::i32, + Comp1, DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32)); + + SDOperand Sum1_rescopy = + DAG.getCopyToReg(CNTB_result, SUM1_reg, Sum1); + + SDOperand Comp2 = + DAG.getNode(ISD::SRL, MVT::i32, + DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32), + Shift2); + SDOperand Sum2 = + DAG.getNode(ISD::ADD, MVT::i32, Comp2, + DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32)); + + return DAG.getNode(ISD::AND, MVT::i32, Sum2, Mask0); + } + + case MVT::i64: + break; + } + + return SDOperand(); +} + +/// LowerOperation - Provide custom lowering hooks for some operations. 
+/// +SDOperand +SPUTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) +{ + switch (Op.getOpcode()) { + default: { + cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n"; + cerr << "Op.getOpcode() = " << Op.getOpcode() << "\n"; + cerr << "*Op.Val:\n"; + Op.Val->dump(); + abort(); + } + case ISD::LOAD: + case ISD::SEXTLOAD: + case ISD::ZEXTLOAD: + return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::STORE: + return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::Constant: + return LowerConstant(Op, DAG); + case ISD::ConstantFP: + return LowerConstantFP(Op, DAG); + case ISD::FORMAL_ARGUMENTS: + return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex); + case ISD::CALL: + return LowerCALL(Op, DAG); + case ISD::RET: + return LowerRET(Op, DAG, getTargetMachine()); + + // i8 math ops: + case ISD::SUB: + case ISD::ROTR: + case ISD::ROTL: + case ISD::SRL: + case ISD::SHL: + case ISD::SRA: + return LowerI8Math(Op, DAG, Op.getOpcode()); + + // Vector-related lowering. + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + + // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return LowerByteImmed(Op, DAG); + + // Vector and i8 multiply: + case ISD::MUL: + if (MVT::isVector(Op.getValueType())) + return LowerVectorMUL(Op, DAG); + else if (Op.getValueType() == MVT::i8) + return LowerI8Math(Op, DAG, Op.getOpcode()); + else + return LowerMUL(Op, DAG, Op.getValueType(), Op.getOpcode()); + + case ISD::FDIV: + if (Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::v4f32) + return LowerFDIVf32(Op, DAG); +// else if (Op.getValueType() == MVT::f64) +// return LowerFDIVf64(Op, DAG); + else + assert(0 && "Calling FDIV on unsupported MVT"); + + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); + } + + return SDOperand(); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +SPUTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *BB) +{ + return BB; +} + +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +SDOperand +SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const +{ +#if 0 + TargetMachine &TM = getTargetMachine(); + SelectionDAG &DAG = DCI.DAG; +#endif + SDOperand N0 = N->getOperand(0); // everything has at least one operand + + switch (N->getOpcode()) { + default: break; + + // Look for obvious optimizations for shift left: + // a) Replace 0 << V with 0 + // b) Replace V << 0 with V + // + // N.B: llvm will generate an undef node if the shift amount is greater than + // 15 (e.g.: V << 16), which will naturally trigger an assert. 
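+  //
+  // For example (illustrative IR, not taken from this patch), both of the
+  // following should fold away without emitting a shift:
+  //
+  //   %a = shl i32 0, %amt     ; --> 0
+  //   %b = shl i32 %val, 0     ; --> %val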
+  case SPU::SHLIr32:
+  case SPU::SHLHIr16:
+  case SPU::SHLQBIIvec:
+  case SPU::ROTHIr16:
+  case SPU::ROTHIr16_i32:
+  case SPU::ROTIr32:
+  case SPU::ROTIr32_i16:
+  case SPU::ROTQBYIvec:
+  case SPU::ROTQBYBIvec:
+  case SPU::ROTQBIIvec:
+  case SPU::ROTHMIr16:
+  case SPU::ROTMIr32:
+  case SPU::ROTQMBYIvec: {
+    if (N0.getOpcode() == ISD::Constant) {
+      if (ConstantSDNode *C = cast<ConstantSDNode>(N0)) {
+        if (C->getValue() == 0)   // 0 << V -> 0.
+          return N0;
+      }
+    }
+    SDOperand N1 = N->getOperand(1);
+    if (N1.getOpcode() == ISD::Constant) {
+      if (ConstantSDNode *C = cast<ConstantSDNode>(N1)) {
+        if (C->getValue() == 0)   // V << 0 -> V
+          return N0;
+      }
+    }
+    break;
+  }
+  }
+
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SPUTargetLowering::ConstraintType
+SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
+  if (ConstraintLetter.size() == 1) {
+    switch (ConstraintLetter[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(ConstraintLetter);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT::ValueType VT) const
+{
+  if (Constraint.size() == 1) {
+    // GCC constraint letters (inherited from the RS6000 port)
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+    case 'r':   // R0-R31
+      if (VT == MVT::i64)
+        return std::make_pair(0U, SPU::R64CRegisterClass);
+      return std::make_pair(0U, SPU::R32CRegisterClass);
+    case 'f':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, SPU::R32FPRegisterClass);
+      else if (VT == MVT::f64)
+        return std::make_pair(0U, SPU::R64FPRegisterClass);
+      break;
+    case 'v':
+      return std::make_pair(0U, SPU::GPRCRegisterClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+void
+SPUTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                  uint64_t Mask,
+                                                  uint64_t &KnownZero,
+                                                  uint64_t &KnownOne,
+                                                  const SelectionDAG &DAG,
+                                                  unsigned Depth) const {
+  KnownZero = 0;
+  KnownOne = 0;
+}
+
+// LowerAsmOperandForConstraint
+void
+SPUTargetLowering::LowerAsmOperandForConstraint(SDOperand Op,
+                                                char ConstraintLetter,
+                                                std::vector<SDOperand> &Ops,
+                                                SelectionDAG &DAG) {
+  // Default, for the time being, to the base class handler
+  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode.
+bool SPUTargetLowering::isLegalAddressImmediate(int64_t V, const Type *Ty) const {
+  // SPU's addresses fit within the 256K local store:
+  return (V > -(1 << 18) && V < (1 << 18) - 1);
+}
+
+bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+  return false;
+}
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
new file mode 100644
index 00000000000..4e3ec3a2457
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -0,0 +1,139 @@
+//===-- SPUISelLowering.h - Cell SPU DAG Lowering Interface -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by a team from the Computer Systems Research
+// Department at The Aerospace Corporation.
+//
+// See README.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Cell SPU uses to lower LLVM code into
+// a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_ISELLOWERING_H
+#define SPU_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SPU.h"
+
+namespace llvm {
+  namespace SPUISD {
+    enum NodeType {
+      // Start the numbering where the built-in ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+SPU::INSTRUCTION_LIST_END,
+
+      // Pseudo instructions:
+      RET_FLAG,             ///< Return with flag, matched by bi instruction
+
+      Hi,                   ///< High address component (upper 16)
+      Lo,                   ///< Low address component (lower 16)
+      PCRelAddr,            ///< Program counter relative address
+      DFormAddr,            ///< D-Form address "imm($r)"
+      XFormAddr,            ///< X-Form address "$r1($r2)"
+
+      LDRESULT,             ///< Load result (value, chain)
+      CALL,                 ///< CALL instruction
+      SHUFB,                ///< Vector shuffle (permute)
+      INSERT_MASK,          ///< Insert element shuffle mask
+      CNTB,                 ///< Count ones in bytes
+      PROMOTE_SCALAR,       ///< Promote scalar->vector
+      EXTRACT_ELT0,         ///< Extract element 0
+      EXTRACT_ELT0_CHAINED, ///< Extract element 0, with chain
+      EXTRACT_I1_ZEXT,      ///< Extract element 0 as i1, zero extend
+      EXTRACT_I1_SEXT,      ///< Extract element 0 as i1, sign extend
+      EXTRACT_I8_ZEXT,      ///< Extract element 0 as i8, zero extend
+      EXTRACT_I8_SEXT,      ///< Extract element 0 as i8, sign extend
+      MPY,                  ///< 16-bit Multiply (low parts of a 32-bit)
+      MPYU,                 ///< Multiply Unsigned
+      MPYH,                 ///< Multiply High
+      MPYHH,                ///< Multiply High-High
+      VEC_SHL,              ///< Vector shift left
+      VEC_SRL,              ///< Vector shift right (logical)
+      VEC_SRA,              ///< Vector shift right (arithmetic)
+      VEC_ROTL,             ///< Vector rotate left
+      VEC_ROTR,             ///< Vector rotate right
+      ROTBYTES_RIGHT_Z,     ///< Vector rotate right, by bytes, zero fill
+      ROTBYTES_RIGHT_S,     ///< Vector rotate right, by bytes, sign fill
+      ROTBYTES_LEFT,        ///< Rotate bytes (loads -> ROTQBYI)
+      ROTBYTES_LEFT_CHAINED, ///< Rotate bytes (loads -> ROTQBYI), with chain
+      FSMBI,                ///< Form Select Mask for Bytes, Immediate
+      SELB,                 ///< Select bits -> (b & mask) | (a & ~mask)
+      SFPConstant,          ///< Single precision floating point constant
+      FPInterp,             ///< Floating point interpolate
+      FPRecipEst,           ///< Floating point reciprocal estimate
+      SEXT32TO64,           ///< Sign-extended 32-bit const -> 64-bits
+      LAST_SPUISD           ///< Last user-defined instruction
+    };
+  }
+
+  /// Predicates that are used for node matching:
+  namespace SPU {
+    SDOperand get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
+                             MVT::ValueType ValueType);
+    SDOperand get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
+                             MVT::ValueType ValueType);
+    SDOperand get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
+                             MVT::ValueType ValueType);
+    SDOperand get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
+                            MVT::ValueType ValueType);
+    SDOperand get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
+                              MVT::ValueType ValueType);
+    SDOperand get_v4i32_imm(SDNode *N, SelectionDAG &DAG);
+    SDOperand get_v2i64_imm(SDNode *N, SelectionDAG &DAG);
+  }
+
+  class SPUTargetMachine;            // forward dec'l.
+
+  class SPUTargetLowering :
+    public TargetLowering
+  {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int ReturnAddrIndex;              // FrameIndex for return slot.
+    SPUTargetMachine &SPUTM;
+
+  public:
+    SPUTargetLowering(SPUTargetMachine &TM);
+
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
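+    /// (For example, a SPUISD::SHUFB node is expected to be reported as the
+    /// string "SPUISD::SHUFB" in DAG dumps.)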
+ virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// LowerOperation - Provide custom lowering hooks for some operations. + /// + virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG); + + virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + virtual void computeMaskedBitsForTargetNode(const SDOperand Op, + uint64_t Mask, + uint64_t &KnownZero, + uint64_t &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *MBB); + + ConstraintType getConstraintType(const std::string &ConstraintLetter) const; + + std::pair + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const; + + void LowerAsmOperandForConstraint(SDOperand Op, char ConstraintLetter, + std::vector &Ops, + SelectionDAG &DAG); + + /// isLegalAddressImmediate - Return true if the integer value can be used + /// as the offset of the target addressing mode. + virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const; + virtual bool isLegalAddressImmediate(GlobalValue *) const; + }; +} + +#endif diff --git a/lib/Target/CellSPU/SPUInstrBuilder.h b/lib/Target/CellSPU/SPUInstrBuilder.h new file mode 100644 index 00000000000..58e455f9f19 --- /dev/null +++ b/lib/Target/CellSPU/SPUInstrBuilder.h @@ -0,0 +1,52 @@ +//==-- SPUInstrBuilder.h - Aides for building Cell SPU insts -----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to simplify generating frame and constant pool +// references. +// +// For reference, the order of operands for memory references is: +// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate +// Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_INSTRBUILDER_H +#define SPU_INSTRBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +namespace llvm { + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... +/// +inline const MachineInstrBuilder& +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, + bool mem = true) { + if (mem) + return MIB.addImm(Offset).addFrameIndex(FI); + else + return MIB.addFrameIndex(FI).addImm(Offset); +} + +/// addConstantPoolReference - This function is used to add a reference to the +/// base of a constant value spilled to the per-function constant pool. The +/// reference has base register ConstantPoolIndex offset which is retained until +/// either machine code emission or assembly output. This allows an optional +/// offset to be added as well. 
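+///
+/// A hypothetical usage sketch (the opcode and registers are illustrative,
+/// not taken from this patch): loading a constant-pool entry,
+///
+///   addConstantPoolReference(BuildMI(MBB, TII.get(SPU::LQDr32), DestReg),
+///                            CPI);
+///
+/// appends the default immediate offset (0) followed by the constant-pool
+/// index, matching the operand order described in the file header.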
+/// +inline const MachineInstrBuilder& +addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, + int Offset = 0) { + return MIB.addImm(Offset).addConstantPoolIndex(CPI); +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td new file mode 100644 index 00000000000..8d3a500a513 --- /dev/null +++ b/lib/Target/CellSPU/SPUInstrFormats.td @@ -0,0 +1,308 @@ +//==== SPUInstrFormats.td - Cell SPU Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by The Aerospace Corporation.... +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Cell SPU instruction formats. Note that these are notationally similar to +// PowerPC, like "A-Form". But the sizes of operands and fields differ. + +// This was kiped from the PPC instruction formats (seemed like a good idea...) + +class I + : Instruction { + field bits<32> Inst; + + let Name = ""; + let Namespace = "SPU"; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; +} + +// RR Format +class RRForm opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I { + bits<7> RA; + bits<7> RB; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-10} = opcode; + let Inst{11-17} = RB; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +let RB = 0 in { + // RR Format, where RB is zeroed (dont care): + class RRForm_1 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : RRForm + { } + + let RA = 0 in { + // RR Format, where RA and RB are zeroed (dont care): + // Used for reads from status control registers (see FPSCRRr32) + class RRForm_2 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : RRForm + { } + } +} + +let RT = 0 in { + // RR Format, where RT is zeroed (don't care), or as the instruction handbook + // says, "RT is a false target." 
Used in "Halt if" instructions + class RRForm_3 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : RRForm + { } +} + +// RRR Format +class RRRForm opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<7> RA; + bits<7> RB; + bits<7> RC; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-3} = opcode; + let Inst{4-10} = RT; + let Inst{11-17} = RB; + let Inst{18-24} = RA; + let Inst{25-31} = RC; +} + +// RI7 Format +class RI7Form opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<7> i7; + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-10} = opcode; + let Inst{11-17} = i7; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +// CVTIntFp Format +class CVTIntFPForm opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-9} = opcode; + let Inst{10-17} = 0; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +let RA = 0 in { + class BICondForm opcode, string asmstr, list pattern> + : RRForm + { } + + let RT = 0 in { + // Branch instruction format (without D/E flag settings) + class BRForm opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : RRForm + { } + + class BIForm opcode, string asmstr, list pattern> + : RRForm + { } + + let RB = 0 in { + // Return instruction (bi, branch indirect), RA is zero (LR): + class RETForm pattern> + : BRForm<0b00010101100, (outs), (ins), asmstr, BranchResolv, + pattern> + { } + } + } +} + +// Branch indirect external data forms: +class BISLEDForm DE_flag, string asmstr, list pattern> + : I<(outs), (ins indcalltarget:$func), asmstr, BranchResolv> +{ + bits<7> Rcalldest; + + let Pattern = pattern; + + let Inst{0-10} = 0b11010101100; + let Inst{11} = 0; + let Inst{12-13} = DE_flag; + let Inst{14-17} = 0b0000; + let Inst{18-24} = Rcalldest; + let Inst{25-31} = 0b0000000; +} + +// RI10 Format +class RI10Form opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<10> i10; + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-7} = opcode; + let Inst{8-17} = i10; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +// RI10 Format, where the constant is zero (or effectively ignored by the +// SPU) +class RI10Form_1 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-7} = opcode; + let Inst{8-17} = 0; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +// RI10 Format, where RT is ignored. +// This format is used primarily by the Halt If ... Immediate set of +// instructions +class RI10Form_2 opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<10> i10; + bits<7> RA; + + let Pattern = pattern; + + let Inst{0-7} = opcode; + let Inst{8-17} = i10; + let Inst{18-24} = RA; + let Inst{25-31} = 0; +} + +// RI16 Format +class RI16Form opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<16> i16; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-8} = opcode; + let Inst{9-24} = i16; + let Inst{25-31} = RT; +} + +// Specialized version of the RI16 Format for unconditional branch relative and +// branch absolute, branch and set link. 
Note that for branch and set link, the +// link register doesn't have to be $lr, but this is actually hard coded into +// the instruction pattern. + +let RT = 0 in { + class UncondBranch opcode, dag OOL, dag IOL, string asmstr, + list pattern> + : RI16Form + { } + + class BranchSetLink opcode, dag OOL, dag IOL, string asmstr, + list pattern> + : RI16Form + { } +} + +// RI18 Format +class RI18Form opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I +{ + bits<18> i18; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-6} = opcode; + let Inst{7-24} = i18; + let Inst{25-31} = RT; +} + +//===----------------------------------------------------------------------===// +// Instruction formats for intrinsics: +//===----------------------------------------------------------------------===// + +// RI10 Format for v8i16 intrinsics +class RI10_Int_v8i16 opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RI10Form; + +class RI10_Int_v4i32 opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RI10Form; + +// RR Format for v8i16 intrinsics +class RR_Int_v8i16 opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RRForm; + +// RR Format for v4i32 intrinsics +class RR_Int_v4i32 opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RRForm; + +//===----------------------------------------------------------------------===// +// Pseudo instructions, like call frames: +//===----------------------------------------------------------------------===// + +class Pseudo pattern> + : I { + let Pattern = pattern; + let Inst{31-0} = 0; +}
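+
+// A hypothetical example of the Pseudo format in use (the real definitions
+// live in SPUInstrInfo.td; the operand list here is illustrative):
+//
+//   def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt),
+//                                 "${:comment} ADJCALLSTACKDOWN",
+//                                 [(callseq_start imm:$amt)]>;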