llvm-6502/lib/Target/CellSPU/SPUISelLowering.cpp

//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by a team from the Computer Systems Research
// Department at The Aerospace Corporation and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the SPUTargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "SPURegisterNames.h"
#include "SPUISelLowering.h"
#include "SPUTargetMachine.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

#include <map>

using namespace llvm;

// Used in getTargetNodeName() below
namespace {
  //! Cache mapping SPUISD opcodes to printable names; it is filled on first
  //! use and queried by SPUTargetLowering::getTargetNodeName().
  std::map<unsigned, const char *> node_names;

  //! Per-scalar-type data needed when lowering Cell SPU loads
  struct valtype_map_s {
    //! Scalar value type this entry describes
    const MVT::ValueType valtype;
    //! Byte offset that LowerLOAD subtracts from the rotation amount so that
    //! the loaded value ends up in the preferred slot
    const int prefslot_byte;
  };

  //! Table searched by getValueTypeMapEntry(), one row per known scalar type
  const valtype_map_s valtype_map[] = {
    { MVT::i1,   3 },
    { MVT::i8,   3 },
    { MVT::i16,  2 },
    { MVT::i32,  0 },
    { MVT::f32,  0 },
    { MVT::i64,  0 },
    { MVT::f64,  0 },
    { MVT::i128, 0 }
  };

  //! Number of rows in valtype_map
  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(*valtype_map);

  //! Look up the valtype_map row for a value type
  /*!
    \arg VT Value type to search for
    \return Pointer to the matching row. When no row matches, debug builds
    print a diagnostic and abort; builds with NDEBUG defined return 0.
   */
  const valtype_map_s *getValueTypeMapEntry(MVT::ValueType VT) {
    const valtype_map_s *const table_end = valtype_map + n_valtype_map;

    for (const valtype_map_s *entry = valtype_map; entry != table_end; ++entry) {
      if (entry->valtype == VT)
        return entry;
    }

#ifndef NDEBUG
    cerr << "getValueTypeMapEntry returns NULL for "
         << MVT::getValueTypeString(VT)
         << "\n";
    abort();
#endif

    return 0;
  }

  //! Predicate that returns true if operand is a memory target
  /*!
    \arg Op Operand to test
    \return true if the operand's opcode is one of the address-producing
    nodes (global/TLS address, frame index, jump table, constant pool or
    external symbol, in either plain or Target form) or an existing
    SPUISD::DFormAddr; false for every other opcode.
   */
  bool isMemoryOperand(const SDOperand &Op)
  {
    switch (Op.getOpcode()) {
    case ISD::GlobalAddress:
    case ISD::GlobalTLSAddress:
    case ISD::FrameIndex:
    case ISD::JumpTable:
    case ISD::ConstantPool:
    case ISD::ExternalSymbol:
    case ISD::TargetGlobalAddress:
    case ISD::TargetGlobalTLSAddress:
    case ISD::TargetFrameIndex:
    case ISD::TargetJumpTable:
    case ISD::TargetConstantPool:
    case ISD::TargetExternalSymbol:
    case SPUISD::DFormAddr:
      return true;
    default:
      return false;
    }
  }
}

/// Build the Cell SPU lowering description.
///
/// Registers the scalar and vector register classes and then records, for
/// each (operation, value type) pair of interest, whether the operation is
/// Legal, Promote, Expand or Custom. A later call for the same pair replaces
/// the earlier setting, so each pair is configured exactly once below, except
/// for the deliberate per-type overrides that follow the vector loop.
///
/// \param TM Target machine this lowering object belongs to.
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM),
    SPUTM(TM)
{
  // Mark division by a power of two as cheap.
  setPow2DivIsCheap();

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set up the SPU's register classes:
  // NOTE: i8 register class is not registered because we cannot determine when
  // we need to zero or sign extend for custom-lowered loads and stores.
  // NOTE: Ignore the previous note. For now. :-)
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadXAction(ISD::EXTLOAD,  MVT::i1, Custom);
  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadXAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  setStoreXAction(MVT::i1, Custom);

  setLoadXAction(ISD::EXTLOAD,  MVT::i8, Custom);
  setLoadXAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadXAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setStoreXAction(MVT::i8, Custom);

  setLoadXAction(ISD::EXTLOAD,  MVT::i16, Custom);
  setLoadXAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadXAction(ISD::ZEXTLOAD, MVT::i16, Custom);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::Constant,   MVT::i64, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered for every scalar type
  // from MVT::i1 up to (but not including) MVT::f128:
  for (unsigned sctype = (unsigned) MVT::i1; sctype < (unsigned) MVT::f128;
       ++sctype) {
    setOperationAction(ISD::LOAD, sctype, Custom);
    setOperationAction(ISD::STORE, sctype, Custom);
  }

  // SPU supports BRCOND, although DAGCombine will convert BRCONDs
  // into BR_CCs. BR_CC instructions are custom selected in
  // SPUDAGToDAGISel.
  setOperationAction(ISD::BRCOND, MVT::Other, Legal);

  // Expand the jumptable branches, BR_CC and SELECT_CC
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);

  // Integer remainder (SREM/UREM) is expanded
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // We don't support sin/cos/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Square root is unconditionally expanded as well
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.
  setOperationAction(ISD::ROTR, MVT::i32,    Legal);
  setOperationAction(ISD::ROTR, MVT::i16,    Legal);
  setOperationAction(ISD::ROTR, MVT::i8,     Custom);
  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);
  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Custom lower i32 multiplications
  setOperationAction(ISD::MUL,  MVT::i32,    Custom);

  // Need to custom handle (some) common i8 math ops
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);

  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);

  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);

  // SPU does not have select or setcc
  setOperationAction(ISD::SELECT, MVT::i1,   Expand);
  setOperationAction(ISD::SELECT, MVT::i8,   Expand);
  setOperationAction(ISD::SELECT, MVT::i16,  Expand);
  setOperationAction(ISD::SELECT, MVT::i32,  Expand);
  setOperationAction(ISD::SELECT, MVT::i64,  Expand);
  setOperationAction(ISD::SELECT, MVT::f32,  Expand);
  setOperationAction(ISD::SELECT, MVT::f64,  Expand);

  setOperationAction(ISD::SETCC, MVT::i1,   Expand);
  setOperationAction(ISD::SETCC, MVT::i8,   Expand);
  setOperationAction(ISD::SETCC, MVT::i16,  Expand);
  setOperationAction(ISD::SETCC, MVT::i32,  Expand);
  setOperationAction(ISD::SETCC, MVT::i64,  Expand);
  setOperationAction(ISD::SETCC, MVT::f32,  Expand);
  setOperationAction(ISD::SETCC, MVT::f64,  Expand);

  // FP -> integer conversions. The i32 signed form is legal and the i64 forms
  // are custom lowered. The i32 unsigned form is promoted so that it can take
  // advantage of the i64 FP_TO_SINT lowering.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  //setOperationAction(ISD::FDIV, MVT::f64, Custom);

  // SPU has [U|S]INT_TO_FP for i32; narrower types are promoted and i64 is
  // custom lowered.
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // Support label based line numbers.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool,  MVT::f32, Custom);
  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool,  MVT::f64, Custom);
  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);

  // RET must be custom lowered, to meet ABI requirements
  setOperationAction(ISD::RET,           MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Register the vector types, then give every vector value type a default
  // action per operation. Individual types are adjusted after the loop.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Legal);
    // mul has to be custom lowered.
    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Custom);

    setOperationAction(ISD::AND   , (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::OR    , (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::XOR   , (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::LOAD  , (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Legal);
    setOperationAction(ISD::STORE,  (MVT::ValueType)VT, Legal);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Custom);

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, (MVT::ValueType)VT, Custom);
    setOperationAction(ISD::ConstantPool, (MVT::ValueType)VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, (MVT::ValueType)VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Custom);
  }

  // Per-type adjustments. AND/OR/XOR on v16i8 override the Legal default set
  // in the loop above; the MUL and SCALAR_TO_VECTOR lines restate the loop's
  // Custom setting.
  setOperationAction(ISD::MUL, MVT::v16i8, Custom);
  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setSetCCResultType(MVT::i32);
  setShiftAmountType(MVT::i32);
  setSetCCResultContents(ZeroOrOneSetCCResult);

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  // e.g., setTargetDAGCombine(ISD::SUB);

  computeRegisterProperties();
}

/// Return the printable name of an SPU-specific DAG node.
///
/// The first call fills the file-local node_names map from a static table;
/// every call then looks Opcode up in that map.
///
/// \param Opcode Node opcode to name.
/// \return The name string, or 0 when Opcode is not in the table.
const char *
SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
{
  if (node_names.empty()) {
    struct NodeNameEntry {
      unsigned opcode;
      const char *name;
    };

    static const NodeNameEntry known_nodes[] = {
      { (unsigned) SPUISD::RET_FLAG, "SPUISD::RET_FLAG" },
      { (unsigned) SPUISD::Hi, "SPUISD::Hi" },
      { (unsigned) SPUISD::Lo, "SPUISD::Lo" },
      { (unsigned) SPUISD::PCRelAddr, "SPUISD::PCRelAddr" },
      { (unsigned) SPUISD::DFormAddr, "SPUISD::DFormAddr" },
      { (unsigned) SPUISD::XFormAddr, "SPUISD::XFormAddr" },
      { (unsigned) SPUISD::LDRESULT, "SPUISD::LDRESULT" },
      { (unsigned) SPUISD::CALL, "SPUISD::CALL" },
      { (unsigned) SPUISD::SHUFB, "SPUISD::SHUFB" },
      { (unsigned) SPUISD::INSERT_MASK, "SPUISD::INSERT_MASK" },
      { (unsigned) SPUISD::CNTB, "SPUISD::CNTB" },
      { (unsigned) SPUISD::PROMOTE_SCALAR, "SPUISD::PROMOTE_SCALAR" },
      { (unsigned) SPUISD::EXTRACT_ELT0, "SPUISD::EXTRACT_ELT0" },
      { (unsigned) SPUISD::EXTRACT_ELT0_CHAINED,
        "SPUISD::EXTRACT_ELT0_CHAINED" },
      { (unsigned) SPUISD::EXTRACT_I1_ZEXT, "SPUISD::EXTRACT_I1_ZEXT" },
      { (unsigned) SPUISD::EXTRACT_I1_SEXT, "SPUISD::EXTRACT_I1_SEXT" },
      { (unsigned) SPUISD::EXTRACT_I8_ZEXT, "SPUISD::EXTRACT_I8_ZEXT" },
      { (unsigned) SPUISD::EXTRACT_I8_SEXT, "SPUISD::EXTRACT_I8_SEXT" },
      { (unsigned) SPUISD::MPY, "SPUISD::MPY" },
      { (unsigned) SPUISD::MPYU, "SPUISD::MPYU" },
      { (unsigned) SPUISD::MPYH, "SPUISD::MPYH" },
      { (unsigned) SPUISD::MPYHH, "SPUISD::MPYHH" },
      { (unsigned) SPUISD::VEC_SHL, "SPUISD::VEC_SHL" },
      { (unsigned) SPUISD::VEC_SRL, "SPUISD::VEC_SRL" },
      { (unsigned) SPUISD::VEC_SRA, "SPUISD::VEC_SRA" },
      { (unsigned) SPUISD::VEC_ROTL, "SPUISD::VEC_ROTL" },
      { (unsigned) SPUISD::VEC_ROTR, "SPUISD::VEC_ROTR" },
      { (unsigned) SPUISD::ROTBYTES_RIGHT_Z, "SPUISD::ROTBYTES_RIGHT_Z" },
      { (unsigned) SPUISD::ROTBYTES_RIGHT_S, "SPUISD::ROTBYTES_RIGHT_S" },
      { (unsigned) SPUISD::ROTBYTES_LEFT, "SPUISD::ROTBYTES_LEFT" },
      { (unsigned) SPUISD::ROTBYTES_LEFT_CHAINED,
        "SPUISD::ROTBYTES_LEFT_CHAINED" },
      { (unsigned) SPUISD::FSMBI, "SPUISD::FSMBI" },
      { (unsigned) SPUISD::SELB, "SPUISD::SELB" },
      { (unsigned) SPUISD::SFPConstant, "SPUISD::SFPConstant" },
      { (unsigned) SPUISD::FPInterp, "SPUISD::FPInterp" },
      { (unsigned) SPUISD::FPRecipEst, "SPUISD::FPRecipEst" },
      { (unsigned) SPUISD::SEXT32TO64, "SPUISD::SEXT32TO64" }
    };

    const size_t n_known_nodes = sizeof(known_nodes) / sizeof(known_nodes[0]);

    // Insert in table order, exactly as a run of assignments would.
    for (size_t idx = 0; idx != n_known_nodes; ++idx)
      node_names[known_nodes[idx].opcode] = known_nodes[idx].name;
  }

  std::map<unsigned, const char *>::iterator found = node_names.find(Opcode);

  if (found == node_names.end())
    return 0;

  return found->second;
}

//===----------------------------------------------------------------------===//
// Calling convention code:
//===----------------------------------------------------------------------===//

#include "SPUGenCallingConv.inc"

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Custom lower loads for CellSPU
/*!
 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to rotate to extract the requested element.

 \arg Op  The load node to lower (must be a LoadSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \arg ST  Subtarget information (not consulted here)
 \return The lowered value: an SPUISD::LDRESULT node or a replacement load
 using a D-Form address. An empty SDOperand is returned when the load is
 already 16-byte aligned, needs no rotation and addresses a memory operand,
 i.e. it can be left unchanged. Indexed addressing modes abort.
 */
static SDOperand
LowerLOAD(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  LoadSDNode *LN = cast<LoadSDNode>(Op);
  SDOperand basep = LN->getBasePtr();
  SDOperand the_chain = LN->getChain();
  MVT::ValueType VT = LN->getLoadedVT();
  MVT::ValueType OpVT = Op.Val->getValueType(0);
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  ISD::LoadExtType ExtType = LN->getExtensionType();
  unsigned alignment = LN->getAlignment();
  const valtype_map_s *vtm = getValueTypeMapEntry(VT);
  SDOperand Ops[3];

  // For an extending load of an i1 variable, just call it i8 (or whatever we
  // were passed) and make it zero-extended:
  if (VT == MVT::i1) {
    VT = OpVT;
    ExtType = ISD::ZEXTLOAD;
  }

  switch (LN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    SDOperand result;
    SDOperand rot_op, rotamt;
    SDOperand ptrp;
    int c_offset;
    int c_rotamt;

    // The vector type we really want to be when we load the 16-byte chunk
    MVT::ValueType vecVT, opVecVT;

    if (VT != MVT::i1)
      vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT)));
    else
      vecVT = MVT::v16i8;

    opVecVT = MVT::getVectorType(OpVT, (128 / MVT::getSizeInBits(OpVT)));

    // Split "base + constant" addresses into pointer and offset. dyn_cast<>
    // yields 0 for a non-constant second operand (cast<> would be an invalid
    // cast); such an address is treated as an opaque pointer, exactly like
    // any other non-ADD base.
    const ConstantSDNode *CN = 0;
    if (basep.getOpcode() == ISD::ADD)
      CN = dyn_cast<ConstantSDNode>(basep.Val->getOperand(1));

    if (CN != 0) {
      c_offset = (int) CN->getValue();
      c_rotamt = (int) (c_offset & 0xf);

      // Adjust the rotation amount to ensure that the final result ends up in
      // the preferred slot:
      c_rotamt -= vtm->prefslot_byte;
      ptrp = basep.getOperand(0);
    } else {
      c_offset = 0;
      c_rotamt = -vtm->prefslot_byte;
      ptrp = basep;
    }

    if (alignment == 16) {
      // 16-byte aligned load into preferred slot, no rotation
      if (c_rotamt == 0) {
        if (isMemoryOperand(ptrp))
          // Return unchanged
          return SDOperand();
        else {
          // Return modified D-Form address for pointer:
          ptrp = DAG.getNode(SPUISD::DFormAddr, PtrVT,
                             ptrp, DAG.getConstant((c_offset & ~0xf), PtrVT));
          if (VT == OpVT)
            return DAG.getLoad(VT, LN->getChain(), ptrp,
                               LN->getSrcValue(), LN->getSrcValueOffset(),
                               LN->isVolatile(), 16);
          else
            return DAG.getExtLoad(ExtType, VT, LN->getChain(), ptrp,
                                  LN->getSrcValue(), LN->getSrcValueOffset(),
                                  OpVT, LN->isVolatile(), 16);
        }
      } else {
        // Need to rotate...
        if (c_rotamt < 0)
          c_rotamt += 16;
        // Realign the base pointer, with a D-Form address. The offset uses
        // the pointer type, as in the no-rotation case above.
        if ((c_offset & ~0xf) != 0 || !isMemoryOperand(ptrp))
          basep = DAG.getNode(SPUISD::DFormAddr, PtrVT,
                              ptrp, DAG.getConstant((c_offset & ~0xf), PtrVT));
        else
          basep = ptrp;

        // Rotate the load:
        rot_op = DAG.getLoad(MVT::v16i8, the_chain, basep,
                             LN->getSrcValue(), LN->getSrcValueOffset(),
                             LN->isVolatile(), 16);
        the_chain = rot_op.getValue(1);
        rotamt = DAG.getConstant(c_rotamt, MVT::i16);

        SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other);
        Ops[0] = the_chain;
        Ops[1] = rot_op;
        Ops[2] = rotamt;

        result = DAG.getNode(SPUISD::ROTBYTES_LEFT_CHAINED, vecvts, Ops, 3);
        the_chain = result.getValue(1);

        if (VT == OpVT || ExtType == ISD::EXTLOAD) {
          // Plain (or any-extending) load: reinterpret the rotated chunk as
          // a vector of the result type and pull out element 0.
          SDVTList scalarvts = DAG.getVTList(OpVT, MVT::Other);

          result = DAG.getNode(ISD::BIT_CONVERT, (OpVT == VT ? vecVT : opVecVT),
                               result);
          Ops[0] = the_chain;
          Ops[1] = result;
          result = DAG.getNode(SPUISD::EXTRACT_ELT0_CHAINED, scalarvts, Ops, 2);
          the_chain = result.getValue(1);
        } else {
          // Handle the sign and zero-extending loads for i1 and i8:
          unsigned NewOpC;

          if (ExtType == ISD::SEXTLOAD) {
            NewOpC = (OpVT == MVT::i1
                      ? SPUISD::EXTRACT_I1_SEXT
                      : SPUISD::EXTRACT_I8_SEXT);
          } else if (ExtType == ISD::ZEXTLOAD) {
            NewOpC = (OpVT == MVT::i1
                      ? SPUISD::EXTRACT_I1_ZEXT
                      : SPUISD::EXTRACT_I8_ZEXT);
          } else {
            // A non-extending load whose loaded and result types differ is
            // malformed; fail loudly instead of using an unset opcode.
            cerr << "LowerLOAD: unexpected load extension type "
                 << (unsigned) ExtType << "\n";
            abort();
            /*NOTREACHED*/
          }

          result = DAG.getNode(NewOpC, OpVT, result);
        }

        SDVTList retvts = DAG.getVTList(OpVT, MVT::Other);
        SDOperand retops[2] = { result, the_chain };

        result = DAG.getNode(SPUISD::LDRESULT, retvts, retops, 2);
        return result;
        /*UNREACHED*/
      }
    } else {
      // Misaligned 16-byte load:
      if (basep.getOpcode() == ISD::LOAD) {
        // Inspect the load that produced the pointer through its own local;
        // LN must keep describing the load being lowered.
        LoadSDNode *BaseLN = cast<LoadSDNode>(basep);
        if (BaseLN->getAlignment() == 16) {
          // We can verify that we're really loading from a 16-byte aligned
          // chunk. Encapsulate basep as a D-Form address and return a new
          // load:
          basep = DAG.getNode(SPUISD::DFormAddr, PtrVT, basep,
                              DAG.getConstant(0, PtrVT));
          if (OpVT == VT)
            return DAG.getLoad(VT, LN->getChain(), basep,
                               LN->getSrcValue(), LN->getSrcValueOffset(),
                               LN->isVolatile(), 16);
          else
            return DAG.getExtLoad(ExtType, VT, LN->getChain(), basep,
                                  LN->getSrcValue(), LN->getSrcValueOffset(),
                                  OpVT, LN->isVolatile(), 16);
        }
      }

      // Catch all other cases where we can't guarantee that we have a
      // 16-byte aligned entity, which means resorting to an X-form
      // address scheme:

      // The Lo/Hi halves are address pieces, so they carry the pointer type
      // (they are summed as PtrVT immediately below).
      SDOperand ZeroOffs = DAG.getConstant(0, PtrVT);
      SDOperand loOp = DAG.getNode(SPUISD::Lo, PtrVT, basep, ZeroOffs);
      SDOperand hiOp = DAG.getNode(SPUISD::Hi, PtrVT, basep, ZeroOffs);

      ptrp = DAG.getNode(ISD::ADD, PtrVT, loOp, hiOp);

      SDOperand alignLoad =
        DAG.getLoad(opVecVT, LN->getChain(), ptrp,
                    LN->getSrcValue(), LN->getSrcValueOffset(),
                    LN->isVolatile(), 16);
      // Thread the new load's chain result through to LDRESULT, as the
      // aligned path does.
      the_chain = alignLoad.getValue(1);

      SDOperand insertEltOp =
        DAG.getNode(SPUISD::INSERT_MASK, vecVT, ptrp);

      result = DAG.getNode(SPUISD::SHUFB, opVecVT,
                           alignLoad,
                           alignLoad,
                           DAG.getNode(ISD::BIT_CONVERT, opVecVT, insertEltOp));

      result = DAG.getNode(SPUISD::EXTRACT_ELT0, OpVT, result);

      SDVTList retvts = DAG.getVTList(OpVT, MVT::Other);
      SDOperand retops[2] = { result, the_chain };

      result = DAG.getNode(SPUISD::LDRESULT, retvts, retops, 2);
      return result;
    }
    break;
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
            "UNINDEXED\n";
    cerr << (unsigned) LN->getAddressingMode() << "\n";
    abort();
    /*NOTREACHED*/
  }

  return SDOperand();
}

/// Custom lower stores for CellSPU
/*!
 All CellSPU stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to generate a shuffle to insert the
 requested element into its place, then store the resulting block.

 \arg Op  The store node to lower (must be a StoreSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \arg ST  Subtarget information (not consulted here)
 \return The replacement store of the merged 16-byte block, or an empty
 SDOperand when the base pointer is a FrameIndex and the store is left
 unchanged. Indexed addressing modes abort.
 */
static SDOperand
LowerSTORE(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDOperand Value = SN->getValue();
  MVT::ValueType VT = Value.getValueType();
  MVT::ValueType StVT = (!SN->isTruncatingStore() ? VT : SN->getStoredVT());
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDOperand the_chain = SN->getChain();

  switch (SN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    SDOperand basep = SN->getBasePtr();
    SDOperand ptrOp;
    int offset;

    if (basep.getOpcode() == ISD::FrameIndex) {
      // FrameIndex nodes are always properly aligned. Really.
      return SDOperand();
    }

    // Split "base + constant" addresses into pointer and offset. dyn_cast<>
    // yields 0 for a non-constant second operand (cast<> would be an invalid
    // cast); such an address is treated as an opaque pointer with offset 0.
    const ConstantSDNode *CN = 0;
    if (basep.getOpcode() == ISD::ADD)
      CN = dyn_cast<ConstantSDNode>(basep.Val->getOperand(1));

    if (CN != 0) {
      offset = (int) CN->getValue();
      ptrOp = basep.getOperand(0);
      DEBUG(cerr << "LowerSTORE: StoreSDNode ISD:ADD offset = "
                 << offset
                 << "\n");
    } else {
      ptrOp = basep;
      offset = 0;
    }

    // The vector type we really want to load from the 16-byte chunk, except
    // in the case of MVT::i1, which has to be v16i8.
    MVT::ValueType vecVT, stVecVT;

    if (StVT != MVT::i1)
      stVecVT = MVT::getVectorType(StVT, (128 / MVT::getSizeInBits(StVT)));
    else
      stVecVT = MVT::v16i8;
    vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT)));

    // Realign the pointer as a D-Form address (ptrOp is the pointer, basep is
    // the actual dform addr offs($reg).
    basep = DAG.getNode(SPUISD::DFormAddr, PtrVT, ptrOp,
                        DAG.getConstant((offset & ~0xf), PtrVT));

    // Create the 16-byte aligned vector load
    SDOperand alignLoad =
      DAG.getLoad(vecVT, the_chain, basep,
                  SN->getSrcValue(), SN->getSrcValueOffset(),
                  SN->isVolatile(), 16);
    the_chain = alignLoad.getValue(1);

    LoadSDNode *LN = cast<LoadSDNode>(alignLoad);
    SDOperand theValue = SN->getValue();
    SDOperand result;

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // The insertion mask is derived from the low four bits of the offset.
    SDOperand insertEltOp =
      DAG.getNode(SPUISD::INSERT_MASK, stVecVT,
                  DAG.getNode(SPUISD::DFormAddr, PtrVT,
                              ptrOp,
                              DAG.getConstant((offset & 0xf), PtrVT)));

    // Merge the scalar into the loaded block...
    result = DAG.getNode(SPUISD::SHUFB, vecVT,
                         DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue),
                         alignLoad,
                         DAG.getNode(ISD::BIT_CONVERT, vecVT, insertEltOp));

    // ...and write the whole block back, reusing the aligned load's
    // source value, volatility and alignment.
    result = DAG.getStore(the_chain, result, basep,
                          LN->getSrcValue(), LN->getSrcValueOffset(),
                          LN->isVolatile(), LN->getAlignment());

    return result;
    /*UNREACHED*/
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    cerr << "LowerSTORE: Got a StoreSDNode with an addr mode other than "
            "UNINDEXED\n";
    cerr << (unsigned) SN->getAddressingMode() << "\n";
    abort();
    /*NOTREACHED*/
  }

  return SDOperand();
}

/// Produce the address of a constant pool entry.
/*!
 \arg Op  The constant pool node to lower (must be a ConstantPoolSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \arg ST  Subtarget, consulted for usingLargeMem()
 \return With the static relocation model: the target constant pool node
 itself, or, when large memory is in use, the sum of its SPUISD::Lo and
 SPUISD::Hi parts. Any other relocation model trips an assertion and yields
 an empty SDOperand.
 */
static SDOperand
LowerConstantPool(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT::ValueType AddrVT = Op.getValueType();
  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
  SDOperand PoolAddr = DAG.getTargetConstantPool(PoolNode->getConstVal(),
                                                 AddrVT,
                                                 PoolNode->getAlignment());
  SDOperand ZeroOffs = DAG.getConstant(0, AddrVT);

  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    assert(0 &&
           "LowerConstantPool: Relocation model other than static not supported.");
    return SDOperand();
  }

  // Small memory model: the constant pool address can be used directly.
  if (!ST->usingLargeMem())
    return PoolAddr;

  // Large memory model: build the address from its hi/lo pair.
  SDOperand HiPart = DAG.getNode(SPUISD::Hi, AddrVT, PoolAddr, ZeroOffs);
  SDOperand LoPart = DAG.getNode(SPUISD::Lo, AddrVT, PoolAddr, ZeroOffs);

  return DAG.getNode(ISD::ADD, AddrVT, LoPart, HiPart);
}

/// Produce the address of a jump table.
/*!
 \arg Op  The jump table node to lower (must be a JumpTableSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \arg ST  Subtarget, consulted for usingLargeMem()
 \return With the static relocation model: the target jump table node
 itself, or, when large memory is in use, the sum of its SPUISD::Lo and
 SPUISD::Hi parts. Any other relocation model trips an assertion and yields
 an empty SDOperand.
 */
static SDOperand
LowerJumpTable(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT::ValueType AddrVT = Op.getValueType();
  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);
  SDOperand TableAddr = DAG.getTargetJumpTable(TableNode->getIndex(), AddrVT);
  SDOperand ZeroOffs = DAG.getConstant(0, AddrVT);

  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    assert(0 &&
           "LowerJumpTable: Relocation model other than static not supported.");
    return SDOperand();
  }

  // Small memory model: the jump table address can be used directly.
  if (!ST->usingLargeMem())
    return TableAddr;

  // Large memory model: build the address from its hi/lo pair.
  SDOperand HiPart = DAG.getNode(SPUISD::Hi, AddrVT, TableAddr, ZeroOffs);
  SDOperand LoPart = DAG.getNode(SPUISD::Lo, AddrVT, TableAddr, ZeroOffs);

  return DAG.getNode(ISD::ADD, AddrVT, LoPart, HiPart);
}

/// Produce the address of a global value.
/*!
 \arg Op  The global address node to lower (must be a GlobalAddressSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \arg ST  Subtarget, consulted for usingLargeMem()
 \return With the static relocation model: the target global address node
 itself, or, when large memory is in use, the sum of its SPUISD::Lo and
 SPUISD::Hi parts. Any other relocation model prints a diagnostic and
 aborts.
 */
static SDOperand
LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT::ValueType AddrVT = Op.getValueType();
  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);
  SDOperand GlobalAddr = DAG.getTargetGlobalAddress(GlobalNode->getGlobal(),
                                                    AddrVT,
                                                    GlobalNode->getOffset());
  SDOperand ZeroOffs = DAG.getConstant(0, AddrVT);

  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    cerr << "LowerGlobalAddress: Relocation model other than static not "
         << "supported.\n";
    abort();
    /*NOTREACHED*/
  }

  // Small memory model: generate a local store address.
  if (!ST->usingLargeMem())
    return GlobalAddr;

  // Large memory model: build the address from its hi/lo pair.
  SDOperand HiPart = DAG.getNode(SPUISD::Hi, AddrVT, GlobalAddr, ZeroOffs);
  SDOperand LoPart = DAG.getNode(SPUISD::Lo, AddrVT, GlobalAddr, ZeroOffs);

  return DAG.getNode(ISD::ADD, AddrVT, LoPart, HiPart);
}

//! Custom lower i64 integer constants
/*!
 Materializes a 64-bit constant by building a v2i64 vector whose two elements
 are both the constant and extracting element 0 from it.

 \arg Op  The constant node to lower (must be a ConstantSDNode)
 \arg DAG Selection DAG in which the replacement nodes are created
 \return The SPUISD::EXTRACT_ELT0 node for an i64 constant. Any other value
 type prints a diagnostic and aborts.
 */
static SDOperand
LowerConstant(SDOperand Op, SelectionDAG &DAG) {
  unsigned ResultVT = Op.getValueType();
  ConstantSDNode *ConstNode = cast<ConstantSDNode>(Op.Val);

  // Only i64 is handled here; reject everything else up front.
  if (ResultVT != MVT::i64) {
    cerr << "LowerConstant: unhandled constant type "
         << MVT::getValueTypeString(ResultVT)
         << "\n";
    abort();
    /*NOTREACHED*/
  }

  SDOperand Elt = DAG.getConstant(ConstNode->getValue(), MVT::i64);
  SDOperand BothElts = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, Elt, Elt);

  return DAG.getNode(SPUISD::EXTRACT_ELT0, ResultVT, BothElts);
}

//! Custom lower floating point constants
/*!
  f32 constants are wrapped in an SPUISD::SFPConstant node around a target
  FP constant. f64 constants are converted to their 64-bit pattern, lowered
  through LowerConstant() as an i64, and bit-converted back to f64. Any other
  value type yields an empty SDOperand.
 */
static SDOperand
LowerConstantFP(SDOperand Op, SelectionDAG &DAG) {
  unsigned VT = Op.getValueType();
  ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.Val);

  assert((FP != 0) &&
         "LowerConstantFP: Node is not ConstantFPSDNode");

  const APFloat &apf = FP->getValueAPF();

  switch (VT) {
  case MVT::f32: {
    SDOperand Imm = DAG.getTargetConstantFP(apf.convertToFloat(), VT);
    return DAG.getNode(SPUISD::SFPConstant, VT, Imm);
  }
  case MVT::f64: {
    uint64_t dbits = DoubleToBits(apf.convertToDouble());
    SDOperand AsInt = LowerConstant(DAG.getConstant(dbits, MVT::i64), DAG);
    return DAG.getNode(ISD::BIT_CONVERT, VT, AsInt);
  }
  default:
    break;
  }

  return SDOperand();
}

//! Lower a FORMAL_ARGUMENTS node: materialize each incoming argument
/*!
 Each argument is copied out of the next SPU argument register
 (SPURegisterInfo::getArgRegs()) while registers remain and the function is
 not variadic. Otherwise it is loaded from a fixed stack object; stack
 offsets start at SPUFrameInfo::minStackSize() and advance by one stack slot
 per stack-passed argument.

 For variadic functions, the argument registers are additionally stored to
 the stack starting at a freshly created fixed object.

 \param Op  The node being lowered; operand 0 is the chain and operand 2 is
            read as the "is vararg" flag.
 \param DAG The selection DAG under construction.
 \param VarArgsFrameIndex [out] Receives the frame index of the vararg save
            area when the function is variadic; left untouched otherwise.
 \return A MERGE_VALUES node holding one value per argument followed by the
         (possibly updated) chain.
 */
static SDOperand
LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  SSARegMap *RegMap = MF.getSSARegMap();
  SmallVector<SDOperand, 8> ArgValues;
  SDOperand Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;

  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();

  // Stack-passed arguments start right after the minimum stack frame.
  unsigned ArgOffset = SPUFrameInfo::minStackSize();
  unsigned ArgRegIdx = 0;
  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();

  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Add DAG nodes to load the arguments or copy them out of registers.
  // The node's last result is excluded from the loop; the chain is appended
  // separately below.
  for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) {
    SDOperand ArgVal;
    bool needsLoad = false;
    MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
    unsigned ObjSize = MVT::getSizeInBits(ObjectVT)/8;

    // Every handled type follows the same pattern and differs only in the
    // register class: take the next argument register if one is free and the
    // function is not variadic, otherwise fall back to a stack load.
    switch (ObjectVT) {
    default: {
      cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
	   << MVT::getValueTypeString(ObjectVT)
           << "\n";
      abort();
    }
    case MVT::i8:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R8CRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i8);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    case MVT::i16:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R16CRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i16);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    case MVT::i32:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R32CRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i32);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    case MVT::i64:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R64CRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i64);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    case MVT::f32:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R32FPRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::f32);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    case MVT::f64:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::R64FPRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::f64);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    // NOTE(review): v2i64 is not listed here, so it reaches the aborting
    // default case -- confirm that is intended.
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (!isVarArg && ArgRegIdx < NumArgRegs) {
        unsigned VReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
        MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
        ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT);
        ++ArgRegIdx;
      } else {
        needsLoad = true;
      }
      break;
    }

    // The argument was not assigned a register above (no registers left, or
    // the function is variadic), so it lives on the stack.
    if (needsLoad) {
      // If the argument is actually used, emit a load from the right stack
      // slot.
      if (!Op.Val->hasNUsesOfValue(0, ArgNo)) {
        int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
        SDOperand FIN = DAG.getFrameIndex(FI, PtrVT);
        ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
      } else {
        // Don't emit a dead load.
        ArgVal = DAG.getNode(ISD::UNDEF, ObjectVT);
      }

      // Each stack argument consumes a full stack slot, used or not.
      ArgOffset += StackSlotSize;
    }

    ArgValues.push_back(ArgVal);
  }

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    VarArgsFrameIndex = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8,
                                               ArgOffset);
    SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
    // If this function is vararg, store any remaining argument regs to
    // their spots on the stack so that they may be loaded by dereferencing the
    // result of va_next. (No register is consumed by the loop above when
    // isVarArg is set, so this covers every argument register.)
    SmallVector<SDOperand, 8> MemOps;
    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
      unsigned VReg = RegMap->createVirtualRegister(&SPU::GPRCRegClass);
      MF.addLiveIn(ArgRegs[ArgRegIdx], VReg);
      SDOperand Val = DAG.getCopyFromReg(Root, VReg, PtrVT);
      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Advance the address by the pointer size for the next argument to store
      SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
    }
    // Join the independent stores so the returned chain depends on all of them.
    if (!MemOps.empty())
      Root = DAG.getNode(ISD::TokenFactor, MVT::Other,&MemOps[0],MemOps.size());
  }

  // The chain is the final result value.
  ArgValues.push_back(Root);

  // Return the new list of results.
  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
                                    Op.Val->value_end());
  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
}

/// isLSAAddress - If Op is a constant that is word aligned and equal to the
/// sign extension of its own low 18 bits, return a new i32 constant node
/// holding that address shifted right by two. Otherwise return null.
static SDNode *isLSAAddress(SDOperand Op, SelectionDAG &DAG) {
  ConstantSDNode *AddrNode = dyn_cast<ConstantSDNode>(Op);
  if (AddrNode == 0)
    return 0;

  int Addr = AddrNode->getValue();

  // Low 2 bits are implicitly zero.
  if ((Addr & 3) != 0)
    return 0;

  // Top 14 bits have to be sext of immediate.
  if ((Addr << 14 >> 14) != Addr)
    return 0;

  SDOperand Munged = DAG.getConstant((int)AddrNode->getValue() >> 2, MVT::i32);
  return Munged.Val;
}

//! Lower a CALL node into an SPUISD::CALL sequence
/*!
 Outgoing arguments are assigned to the SPU argument registers
 (SPURegisterInfo::getArgRegs()) while any remain; the rest are stored
 relative to R1, starting at offset SPUFrameInfo::minStackSize() and
 advancing one stack slot per stored argument. The call is bracketed by
 CALLSEQ_START / CALLSEQ_END, and results are copied out of R3 (plus R4 when
 two i32 values are returned).

 \param Op  The call node; operand 0 is the chain, operand 4 the callee, and
            arguments are read from every second operand starting at index 5.
 \param DAG The selection DAG under construction.
 \return The chain alone when the call produces no value; otherwise the
         requested result (Op.ResNo) of a MERGE_VALUES node holding the
         returned value(s) followed by the chain.
 */
static
SDOperand
LowerCALL(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Chain = Op.getOperand(0);
#if 0
  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
#endif
  SDOperand Callee    = Op.getOperand(4);
  // Arguments occupy operand pairs after the five fixed operands.
  unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();

  // Handy pointer type
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP]
  // (This initial value is replaced once the argument loop has run.)
  unsigned NumStackBytes = SPUFrameInfo::minStackSize();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDOperand StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDOperand> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDOperand, 8> MemOpChains;

  for (unsigned i = 0; i != NumOps; ++i) {
    SDOperand Arg = Op.getOperand(5+2*i);

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDOperand PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff);

    // All three groups below behave identically: use the next argument
    // register if one is left, otherwise store to the current stack slot.
    // In builds where assert() is compiled out, an unexpected type falls
    // through into the integer case.
    switch (Arg.getValueType()) {
    default: assert(0 && "Unexpected ValueType for argument!");
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
	ArgOffset += StackSlotSize;
      }
      break;
    case MVT::f32:
    case MVT::f64:
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
	ArgOffset += StackSlotSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
	ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Update number of stack bytes actually used, insert a call sequence start
  NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize());
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumStackBytes, PtrVT));

  if (!MemOpChains.empty()) {
    // Merge the chains of all argument stores into a single token.
    // NOTE(review): the stores above were chained to the incoming chain, not
    // to the CALLSEQ_START created just before this, so the TokenFactor built
    // here does not have CALLSEQ_START as an operand -- confirm this is
    // intended.
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  std::vector<MVT::ValueType> NodeTys;
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.

  SmallVector<SDOperand, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    GlobalValue *GV = G->getGlobal();
    unsigned CalleeVT = Callee.getValueType();

    // Turn calls to targets that are defined (i.e., have bodies) into BRSL
    // style calls, otherwise, external symbols are BRASL calls.
    // NOTE:
    // This may be an unsafe assumption for JIT and really large compilation
    // units.
    if (GV->isDeclaration()) {
      Callee = DAG.getGlobalAddress(GV, CalleeVT);
    } else {
      Callee = DAG.getNode(SPUISD::PCRelAddr, CalleeVT,
                           DAG.getTargetGlobalAddress(GV, CalleeVT),
                           DAG.getConstant(0, PtrVT));
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getExternalSymbol(S->getSymbol(), Callee.getValueType());
  else if (SDNode *Dest = isLSAAddress(Callee, DAG))
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDOperand(Dest, 0);

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // The flag is only present when at least one register argument was copied.
  if (InFlag.Val)
    Ops.push_back(InFlag);
  Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Room for up to two result values plus the trailing chain.
  SDOperand ResultVals[3];
  unsigned NumResults = 0;
  NodeTys.clear();

  // If the call has results, copy the values out of the ret val registers.
  switch (Op.Val->getValueType(0)) {
  default: assert(0 && "Unexpected ret value!");
  case MVT::Other: break;
  case MVT::i32:
    if (Op.Val->getValueType(1) == MVT::i32) {
      // Two i32 results: the first comes from R4, the second from R3.
      Chain = DAG.getCopyFromReg(Chain, SPU::R4, MVT::i32, InFlag).getValue(1);
      ResultVals[0] = Chain.getValue(0);
      Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32,
                                 Chain.getValue(2)).getValue(1);
      ResultVals[1] = Chain.getValue(0);
      NumResults = 2;
      NodeTys.push_back(MVT::i32);
    } else {
      Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, InFlag).getValue(1);
      ResultVals[0] = Chain.getValue(0);
      NumResults = 1;
    }
    NodeTys.push_back(MVT::i32);
    break;
  case MVT::i64:
    Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i64, InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(MVT::i64);
    break;
  case MVT::f32:
  case MVT::f64:
    Chain = DAG.getCopyFromReg(Chain, SPU::R3, Op.Val->getValueType(0),
                               InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(Op.Val->getValueType(0));
    break;
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
  case MVT::v16i8:
    Chain = DAG.getCopyFromReg(Chain, SPU::R3, Op.Val->getValueType(0),
                                   InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(Op.Val->getValueType(0));
    break;
  }

  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
                      DAG.getConstant(NumStackBytes, PtrVT));
  NodeTys.push_back(MVT::Other);

  // If the function returns void, just return the chain.
  if (NumResults == 0)
    return Chain;

  // Otherwise, merge everything together with a MERGE_VALUES node.
  ResultVals[NumResults++] = Chain;
  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys,
                              ResultVals, NumResults);
  return Res.getValue(Op.ResNo);
}

/// LowerRET - Lower a return node. The return values are assigned to
/// locations by RetCC_SPU, copied into those registers through a flag-linked
/// chain of CopyToReg nodes, and terminated with an SPUISD::RET_FLAG node.
static SDOperand
LowerRET(SDOperand Op, SelectionDAG &DAG, TargetMachine &TM) {
  MachineFunction &MF = DAG.getMachineFunction();

  SmallVector<CCValAssign, 16> RVLocs;
  unsigned CC = MF.getFunction()->getCallingConv();
  bool isVarArg = MF.getFunction()->isVarArg();
  CCState CCInfo(CC, isVarArg, TM, RVLocs);
  CCInfo.AnalyzeReturn(Op.Val, RetCC_SPU);

  // The first return lowered for a function registers the return-value
  // registers as live-out; later returns find the set already populated.
  if (MF.liveout_empty()) {
    for (unsigned LocNo = 0; LocNo != RVLocs.size(); ++LocNo)
      MF.addLiveOut(RVLocs[LocNo].getLocReg());
  }

  SDOperand Chain = Op.getOperand(0);
  SDOperand Flag;

  // Return value number LocNo is operand 2*LocNo+1 of the return node.
  for (unsigned LocNo = 0; LocNo != RVLocs.size(); ++LocNo) {
    CCValAssign &VA = RVLocs[LocNo];
    assert(VA.isRegLoc() && "Can only return in registers!");
    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(),
                             Op.getOperand(LocNo*2+1), Flag);
    Flag = Chain.getValue(1);
  }

  // Without any copied value there is no flag to attach.
  if (!Flag.Val)
    return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain);

  return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain, Flag);
}


//===----------------------------------------------------------------------===//
// Vector related lowering:
//===----------------------------------------------------------------------===//

/// getVecImm - If every non-UNDEF operand of N is the same operand and that
/// operand is a ConstantSDNode, return it. Return null when the operands
/// differ, when all of them are UNDEF, or when the common operand is not a
/// constant node.
static ConstantSDNode *
getVecImm(SDNode *N) {
  SDOperand Common(0, 0);

  const unsigned NumElts = N->getNumOperands();
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDOperand Elt = N->getOperand(Idx);

    // UNDEF elements never disqualify a splat.
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    if (Common.Val == 0) {
      Common = Elt;            // first defined element seen
      continue;
    }

    if (Common != Elt)
      return 0;                // two different defined elements
  }

  // All UNDEF: use implicit def.
  if (Common.Val == 0)
    return 0;

  // Null here means the common element is not a Constant node.
  return dyn_cast<ConstantSDNode>(Common);
}

/// get_vec_u18imm - Test if this vector is a vector filled with the same value
/// and the value fits into an unsigned 18-bit constant (at most 0x3ffff), and
/// if so, return the constant with type ValueType. Otherwise return an empty
/// SDOperand.
SDOperand SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
                              MVT::ValueType ValueType) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  uint64_t Value = Splat->getValue();
  if (Value > 0x3ffff)
    return SDOperand();

  return DAG.getConstant(Value, ValueType);
}

/// get_vec_i16imm - Test if this vector is a vector filled with the same value
/// and the value equals the sign extension of its own low 16 bits, and if so,
/// return the constant. Only i16, i32 and i64 element types are examined; any
/// other type yields an empty SDOperand.
SDOperand SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
                              MVT::ValueType ValueType) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  switch (ValueType) {
  case MVT::i32: {
    int Value = (int) Splat->getValue();
    int SExtValue = ((Value & 0xffff) << 16) >> 16;

    if (Value == SExtValue)
      return DAG.getConstant(Value, ValueType);
    break;
  }
  case MVT::i16: {
    short Value = (short) Splat->getValue();
    int SExtValue = ((int) Value << 16) >> 16;

    if (Value == (short) SExtValue)
      return DAG.getConstant(Value, ValueType);
    break;
  }
  case MVT::i64: {
    int64_t Value = Splat->getValue();
    int64_t SExtValue = ((Value & 0xffff) << (64 - 16)) >> (64 - 16);

    if (Value == SExtValue)
      return DAG.getConstant(Value, ValueType);
    break;
  }
  default:
    break;
  }

  return SDOperand();
}

/// get_vec_i10imm - Test if this vector is a vector filled with the same value
/// and isS10Constant() accepts that value (checked as an int for i32 elements
/// and as a short for i16 elements), and if so, return the constant. Other
/// element types yield an empty SDOperand.
SDOperand SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
                              MVT::ValueType ValueType) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  int Value = (int) Splat->getValue();

  bool Fits = false;
  if (ValueType == MVT::i32)
    Fits = isS10Constant(Value);
  else if (ValueType == MVT::i16)
    Fits = isS10Constant((short) Value);

  if (!Fits)
    return SDOperand();

  return DAG.getConstant(Value, ValueType);
}

/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into an 8-bit constant, and if so, return the
/// constant.
///
/// For i16 elements the value qualifies when its upper and lower bytes are
/// equal, and the returned constant is that single byte. For i8 elements the
/// value qualifies when it lies in 0..0xff.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDOperand SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             MVT::ValueType ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getValue();
    // Compare the two bytes after masking each to 8 bits. Shifting a signed
    // short right sign-extends, so an unmasked high byte >= 0x80 would never
    // compare equal to the masked low byte and splats such as 0x8080 or
    // 0xffff would be rejected.
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((Value >> 8) & 0xff) == (Value & 0xff))
      return DAG.getConstant(Value & 0xff, ValueType);
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      return DAG.getConstant(Value, ValueType);
  }

  return SDOperand();
}

/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
/// and that value is unchanged by masking with 0xffff0000 (i.e. only bits
/// 16..31 may be set), and if so, return the value shifted right by 16.
/// Only i32 and i64 element types are examined.
SDOperand SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
                               MVT::ValueType ValueType) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  uint64_t Value = Splat->getValue();

  bool UpperHalfOnly = false;
  if (ValueType == MVT::i32) {
    // For i32 only the low 32 bits of the stored value are examined.
    UpperHalfOnly = ((unsigned) Value & 0xffff0000) == (unsigned) Value;
  } else if (ValueType == MVT::i64) {
    UpperHalfOnly = (Value & 0xffff0000) == Value;
  }

  if (!UpperHalfOnly)
    return SDOperand();

  return DAG.getConstant(Value >> 16, ValueType);
}

/// get_v4i32_imm - Catch-all for general 32-bit constant vectors: if the
/// vector is filled with a single constant, return it as an i32 constant.
SDOperand SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  return DAG.getConstant((unsigned) Splat->getValue(), MVT::i32);
}

/// get_v2i64_imm - Catch-all for general 64-bit constant vectors: if the
/// vector is filled with a single constant, return it as an i64 constant.
///
/// NOTE(review): the value is cast to unsigned before the i64 constant is
/// built, so anything above the width of unsigned is dropped -- confirm this
/// truncation is intended.
SDOperand SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
  ConstantSDNode *Splat = getVecImm(N);
  if (Splat == 0)
    return SDOperand();

  return DAG.getConstant((unsigned) Splat->getValue(), MVT::i64);
}

// Collect the bit pattern of a build-vector whose elements are all constants
// or undefs. The first half of the operands is packed into index 0 of each
// output array and the second half into index 1, with earlier operands in the
// more significant bits. A field in UndefBits is all ones where the
// corresponding element is ISD::UNDEF; the matching VectorBits field stays
// zero. Integer and floating point constants contribute their bit patterns.
//
// Returns true if a non-constant element is found (the outputs are then only
// partially filled), false if every element was a constant or undef.
//
static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
                                       uint64_t UndefBits[2]) {
  // Start with zero'd results.
  VectorBits[0] = 0;
  VectorBits[1] = 0;
  UndefBits[0] = 0;
  UndefBits[1] = 0;

  unsigned EltBitSize = MVT::getSizeInBits(BV->getOperand(0).getValueType());
  const unsigned NumElts = BV->getNumOperands();
  const unsigned EltsPerWord = NumElts / 2;

  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDOperand Elt = BV->getOperand(Idx);

    // Which uint64_t receives this element, and where inside it.
    const unsigned WordNo = Idx >= EltsPerWord;
    const unsigned SlotNo = EltsPerWord - (Idx & (EltsPerWord - 1)) - 1;
    const unsigned Shift = SlotNo * EltBitSize;

    if (Elt.getOpcode() == ISD::UNDEF) {
      uint64_t EltUndefBits = ~0ULL >> (64 - EltBitSize);
      UndefBits[WordNo] |= EltUndefBits << Shift;
      continue;
    }

    uint64_t EltBits = 0;
    if (ConstantSDNode *IntElt = dyn_cast<ConstantSDNode>(Elt)) {
      // Keep only the bits that belong to the element.
      EltBits = IntElt->getValue() & (~0ULL >> (64 - EltBitSize));
    } else if (ConstantFPSDNode *FPElt = dyn_cast<ConstantFPSDNode>(Elt)) {
      const APFloat &apf = FPElt->getValueAPF();
      if (FPElt->getValueType(0) == MVT::f32)
        EltBits = FloatToBits(apf.convertToFloat());
      else
        EltBits = DoubleToBits(apf.convertToDouble());
    } else {
      // Nonconstant element.
      return true;
    }

    VectorBits[WordNo] |= EltBits << Shift;
  }

  return false;
}

/// Test whether a 128-bit constant, given as two 64-bit words plus matching
/// undef masks, is a splat (repetition) of a smaller value, ignoring undef
/// bits. The halves are compared at successively smaller widths (64, 32, 16,
/// 8 bits); the first width that is <= MinSplatBits is reported. For example,
/// "0x01010101010101..." yields SplatBits = 0x01, SplatSize = 1 with
/// MinSplatBits = 8, and SplatBits = 0x0101, SplatSize = 2 with
/// MinSplatBits = 16.
///
/// Returns false, leaving the outputs untouched, as soon as the two halves at
/// any width that has to be examined do not match.
static bool isConstantSplat(const uint64_t Bits128[2],
                            const uint64_t Undef128[2],
                            int MinSplatBits,
                            uint64_t &SplatBits, uint64_t &SplatUndef,
                            int &SplatSize) {
  // Fold each width onto the next smaller one: defined bits are OR'd, while a
  // bit stays undefined only if it is undefined in both halves.
  const uint64_t Bits64  = Bits128[0] | Bits128[1];
  const uint64_t Undef64 = Undef128[0] & Undef128[1];
  const uint32_t Bits32  = uint32_t(Bits64) | uint32_t(Bits64 >> 32);
  const uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32);
  const uint16_t Bits16  = uint16_t(Bits32)  | uint16_t(Bits32 >> 16);
  const uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16);

  // The two 64-bit words must agree, ignoring undefs.
  if ((Bits128[0] & ~Undef128[1]) != (Bits128[1] & ~Undef128[0]))
    return false;  // Can't be a splat if two pieces don't match.

  if (MinSplatBits >= 64) {
    SplatBits = Bits128[0];
    SplatUndef = Undef128[0];
    SplatSize = 8;
    return true;
  }

  // The top 32 bits must be the same as the lower 32 bits, ignoring undefs.
  if ((Bits64 & (~Undef64 >> 32)) != ((Bits64 >> 32) & ~Undef64))
    return false;

  if (MinSplatBits >= 32) {
    SplatBits = Bits32;
    SplatUndef = Undef32;
    SplatSize = 4;
    return true;
  }

  // The top 16 bits must be the same as the lower 16 bits, ignoring undefs.
  if ((Bits32 & (~Undef32 >> 16)) != ((Bits32 >> 16) & ~Undef32))
    return false;

  if (MinSplatBits >= 16) {
    SplatBits = Bits16;
    SplatUndef = Undef16;
    SplatSize = 2;
    return true;
  }

  // The top 8 bits must be the same as the lower 8 bits, ignoring undefs.
  if ((Bits16 & (uint16_t(~Undef16) >> 8)) != ((Bits16 >> 8) & ~Undef16))
    return false;

  // We have an 8-bit splat.
  SplatBits  = uint8_t(Bits16)  | uint8_t(Bits16 >> 8);
  SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8);
  SplatSize = 1;
  return true;
}

// Lower a BUILD_VECTOR whose elements are constants/undefs forming a splat.
//
// If this is a case we can't handle (a non-constant element, or a constant
// vector that is not a splat at the element width), return null and let the
// default expansion code take care of it.  Otherwise rebuild the vector from
// the splat value in a form the target can select: FP vectors become a
// BIT_CONVERT of the integer vector with the same bit pattern, v16i8 is
// rebuilt as v8i16, and non-zero v2i64 splats are assembled from 32-bit
// splats with a SHUFB.
static SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  // If this is a vector of constants or undefs, get the bits.  A bit in
  // UndefBits is set if the corresponding element of the vector is an
  // ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
  // zero.
  uint64_t VectorBits[2];
  uint64_t UndefBits[2];
  uint64_t SplatBits, SplatUndef;
  int SplatSize;
  // The element width is passed as the minimum splat width, so a splat is
  // only reported at the vector's own element size.
  if (GetConstantBuildVectorBits(Op.Val, VectorBits, UndefBits)
      || !isConstantSplat(VectorBits, UndefBits,
			  MVT::getSizeInBits(MVT::getVectorElementType(VT)),
                          SplatBits, SplatUndef, SplatSize))
    return SDOperand();   // Not a constant vector, not a splat.

  switch (VT) {
  // NOTE(review): any vector type not listed below falls into the v4f32 case
  // via this default label -- confirm that is intended.
  default:
  case MVT::v4f32: {
    uint32_t Value32 = SplatBits;
    assert(SplatSize == 4
	   && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDOperand T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32,
		       DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, T, T, T, T));
    break;
  }
  case MVT::v2f64: {
    uint64_t f64val = SplatBits;
    assert(SplatSize == 8
	   && "LowerBUILD_VECTOR: 64-bit float vector element: unexpected size.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDOperand T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64,
		       DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T));
    break;
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits: duplicate the byte into
   // both halves of an i16 and build the vector as v8i16.
   unsigned short Value16 = SplatBits | (SplatBits << 8);
   SDOperand Ops[8];
   for (int i = 0; i < 8; ++i)
     Ops[i] = DAG.getConstant(Value16, MVT::i16);
   return DAG.getNode(ISD::BIT_CONVERT, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, MVT::v8i16, Ops, 8));
  }
  case MVT::v8i16: {
    unsigned short Value16;
    // A 2-byte splat is used directly; otherwise the splat value is
    // duplicated into both bytes.
    if (SplatSize == 2)
      Value16 = (unsigned short) (SplatBits & 0xffff);
    else
      Value16 = (unsigned short) (SplatBits | (SplatBits << 8));
    SDOperand T = DAG.getConstant(Value16, MVT::getVectorElementType(VT));
    SDOperand Ops[8];
    for (int i = 0; i < 8; ++i) Ops[i] = T;
    return DAG.getNode(ISD::BUILD_VECTOR, VT, Ops, 8);
  }
  case MVT::v4i32: {
    unsigned int Value = SplatBits;
    SDOperand T = DAG.getConstant(Value, MVT::getVectorElementType(VT));
    return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    // Split the 64-bit splat into 32-bit halves; each half is materialized as
    // a v4i32 splat and the halves are interleaved with a SHUFB.
    uint64_t val = SplatBits;
    uint32_t upper = uint32_t(val >> 32);
    uint32_t lower = uint32_t(val);

    if (val != 0) {
      SDOperand LO32;
      SDOperand HI32;
      SmallVector<SDOperand, 16> ShufBytes;
      SDOperand Result;
      bool upper_special, lower_special;

      // NOTE: This code creates common-case shuffle masks that can be easily
      // detected as common expressions. It is not attempting to create highly
      // specialized masks to replace any and all 0's, 0xff's and 0x80's.

      // Detect if the upper or lower half is a special shuffle mask pattern:
      upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
      lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);

      // Create lower vector if not a special pattern
      if (!lower_special) {
	SDOperand LO32C = DAG.getConstant(lower, MVT::i32);
	LO32 = DAG.getNode(ISD::BIT_CONVERT, VT,
			   DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
				       LO32C, LO32C, LO32C, LO32C));
      }

      // Create upper vector if not a special pattern
      if (!upper_special) {
	SDOperand HI32C = DAG.getConstant(upper, MVT::i32);
	HI32 = DAG.getNode(ISD::BIT_CONVERT, VT,
			   DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
				       HI32C, HI32C, HI32C, HI32C));
      }

      // If either upper or lower are special, then the two input operands are
      // the same (basically, one of them is a "don't care")
      if (lower_special)
	LO32 = HI32;
      if (upper_special)
	HI32 = LO32;
      if (lower_special && upper_special) {
	// Unhappy situation... both upper and lower are special, so punt with
	// a target constant:
        SDOperand Zero = DAG.getConstant(0, MVT::i32);
	HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero,
                                  Zero, Zero);
      }

      // Build the 16 shuffle control bytes: i walks the four 32-bit words of
      // the result (even words take the upper half, odd words the lower half)
      // and j the bytes within a word.
      for (int i = 0; i < 4; ++i) {
	for (int j = 0; j < 4; ++j) {
	  SDOperand V;
	  bool process_upper, process_lower;
	  uint64_t val;

	  process_upper = (upper_special && (i & 1) == 0);
	  process_lower = (lower_special && (i & 1) == 1);

	  if (process_upper || process_lower) {
	    // Special halves are encoded with the control bytes 0x80, 0xc0
	    // and 0xe0 instead of selecting a source byte (presumably the
	    // shufb codes that produce 0x00, 0xff and 0x80 -- confirm against
	    // the SPU ISA).
	    if ((process_upper && upper == 0)
		|| (process_lower && lower == 0))
	      val = 0x80;
	    else if ((process_upper && upper == 0xffffffff)
		     || (process_lower && lower == 0xffffffff))
	      val = 0xc0;
	    else if ((process_upper && upper == 0x80000000)
		     || (process_lower && lower == 0x80000000))
	      val = (j == 0 ? 0xe0 : 0x80);
	  } else
	    // Ordinary byte select; odd words add 16 to index into the second
	    // shuffle operand (LO32).
	    val = i * 4 + j + ((i & 1) * 16);

	  ShufBytes.push_back(DAG.getConstant(val, MVT::i8));
	}
      }

      return DAG.getNode(SPUISD::SHUFB, VT, HI32, LO32,
			 DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
				     &ShufBytes[0], ShufBytes.size()));
    } else {
      // For zero, this can be lowered efficiently via v4i32 BUILD_VECTOR
      SDOperand Zero = DAG.getConstant(0, MVT::i32);
      return DAG.getNode(ISD::BIT_CONVERT, VT,
			 DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
				     Zero, Zero, Zero, Zero));
    }
  }
  }

  return SDOperand();
}

/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// INSERT_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// INSERT_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
  // Operands: the two input vectors and the permutation mask.
  SDOperand V1 = Op.getOperand(0);
  SDOperand V2 = Op.getOperand(1);
  SDOperand PermMask = Op.getOperand(2);

  // Substitute V1 for an undefined second input.
  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V2 into V1, this can be
  // handled using the C*[DX] compute mask instructions, but the vector
  // elements have to be monotonically increasing with one exception element.
  MVT::ValueType EltVT = MVT::getVectorElementType(V1.getValueType());
  unsigned EltsFromV2 = 0;	// number of mask entries that select from V2
  unsigned V2Elt = 0;		// value handed to INSERT_MASK below
  unsigned V2EltIdx0 = 0;	// first mask index that refers to V2
  unsigned CurrElt = 0;		// position expected for an "in place" V1 element
  bool monotonic = true;
  // Mask indices at or above the per-vector element count select from V2;
  // only i8, i16 and i32 element types are handled.
  if (EltVT == MVT::i8)
    V2EltIdx0 = 16;
  else if (EltVT == MVT::i16)
    V2EltIdx0 = 8;
  else if (EltVT == MVT::i32)
    V2EltIdx0 = 4;
  else
    assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE");

  // Scan the mask; the scan stops early as soon as a second V2 element is
  // seen or a V1 element is found out of position.
  for (unsigned i = 0, e = PermMask.getNumOperands();
       EltsFromV2 <= 1 && monotonic && i != e;
       ++i) {
    unsigned SrcElt;
    // Undefined mask entries are treated as selecting element 0.
    if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
      SrcElt = 0;
    else
      SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getValue();

    if (SrcElt >= V2EltIdx0) {
      ++EltsFromV2;
      // NOTE(review): SrcElt >= V2EltIdx0 on this path, so the unsigned
      // subtraction below is 0 or wraps around; confirm the intended operand
      // order and the fixed "<< 2" scaling for non-i32 element types.
      V2Elt = (V2EltIdx0 - SrcElt) << 2;
    } else if (CurrElt != SrcElt) {
      monotonic = false;
    }

    ++CurrElt;
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();
    unsigned VReg = RegMap->createVirtualRegister(&SPU::R32CRegClass);
    MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    // Initialize temporary register to 0
    SDOperand InitTempReg =
      DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT));
    // Copy register's contents as index in INSERT_MASK:
    SDOperand ShufMaskOp =
      DAG.getNode(SPUISD::INSERT_MASK, V1.getValueType(),
		  DAG.getTargetConstant(V2Elt, MVT::i32),
		  DAG.getCopyFromReg(InitTempReg, VReg, PtrVT));
    // Use shuffle mask in SHUFB synthetic instruction (note the operand
    // order here is V2, V1):
    return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V2, V1, ShufMaskOp);
  } else {
    // Convert the SHUFFLE_VECTOR mask's input element units to the actual bytes.
    unsigned BytesPerElement = MVT::getSizeInBits(EltVT)/8;

    SmallVector<SDOperand, 16> ResultMask;
    for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) {
      unsigned SrcElt;
      // As above, undefined mask entries select element 0.
      if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
	SrcElt = 0;
      else
	SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getValue();

      // Each source element expands to BytesPerElement consecutive byte
      // indices.
      for (unsigned j = 0; j != BytesPerElement; ++j) {
	ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
					     MVT::i8));
      }
    }

    // The byte-level mask is materialized as a v16i8 BUILD_VECTOR and used
    // as the SHUFB control operand.
    SDOperand VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
				      &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V1, V2, VPermMask);
  }
}

/// LowerSCALAR_TO_VECTOR - Lower a scalar-to-vector conversion. A constant
/// scalar becomes a BUILD_VECTOR in which every element holds that constant;
/// any other scalar is wrapped in a SPUISD::PROMOTE_SCALAR node.
static SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Scalar = Op.getOperand(0);
  const MVT::ValueType ResultVT = Op.getValueType();

  if (Scalar.Val->getOpcode() != ISD::Constant) {
    // Non-constant scalar: copy the value from one register to another.
    switch (Scalar.getValueType()) {
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::f32:
    case MVT::f64:
      break;
    default:
      assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR");
      break;
    }

    return DAG.getNode(SPUISD::PROMOTE_SCALAR, ResultVT, Scalar, Scalar);
  }

  // Constant scalar: build the matching constant vector, which will
  // eventually simplify to a vector register load.
  ConstantSDNode *ConstNode = cast<ConstantSDNode>(Scalar.Val);
  MVT::ValueType EltVT;
  size_t NumElts;

  switch (ResultVT) {
  default:
    assert(0 && "Unexpected constant value type in "
		"LowerSCALAR_TO_VECTOR");
    // When asserts are compiled out this continues into the v16i8 case.
  case MVT::v16i8: NumElts = 16; EltVT = MVT::i8;  break;
  case MVT::v8i16: NumElts = 8;  EltVT = MVT::i16; break;
  case MVT::v4i32: NumElts = 4;  EltVT = MVT::i32; break;
  case MVT::v4f32: NumElts = 4;  EltVT = MVT::f32; break;
  case MVT::v2i64: NumElts = 2;  EltVT = MVT::i64; break;
  case MVT::v2f64: NumElts = 2;  EltVT = MVT::f64; break;
  }

  SDOperand EltValue = DAG.getConstant(ConstNode->getValue(), EltVT);
  SmallVector<SDOperand, 16> Elements;
  for (size_t Remaining = NumElts; Remaining != 0; --Remaining)
    Elements.push_back(EltValue);

  return DAG.getNode(ISD::BUILD_VECTOR, ResultVT,
                     &Elements[0], Elements.size());
}

// Lower an integer vector multiply for the v4i32, v8i16 and v16i8 result
// types by combining the SPU multiply nodes (MPY, MPYU, MPYH, MPYHH) with
// shifts and SELB selects. Any other result type prints a diagnostic and
// aborts.
static SDOperand LowerVectorMUL(SDOperand Op, SelectionDAG &DAG) {
  switch (Op.getValueType()) {
  // v4i32: the result is MPYU(a, b) + MPYH(a, b) + MPYH(b, a).
  case MVT::v4i32: {
    SDOperand rA = Op.getOperand(0);
    SDOperand rB = Op.getOperand(1);
    SDOperand HiProd1 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rA, rB);
    SDOperand HiProd2 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rB, rA);
    SDOperand LoProd = DAG.getNode(SPUISD::MPYU, MVT::v4i32, rA, rB);
    SDOperand Residual1 = DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd1);

    return DAG.getNode(ISD::ADD, MVT::v4i32, Residual1, HiProd2);
    break;	// not reached: the return above always exits
  }

  // Multiply two v8i16 vectors (pipeline friendly version):
  // a) multiply lower halves, mask off upper 16-bit of 32-bit product
  // b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes)
  // c) Use SELB to select upper and lower halves from the intermediate results
  //
  // NOTE: We really want to move the FSMBI to earlier to actually get the
  // dual-issue. This code does manage to do this, even if it's a little on
  // the wacky side
  case MVT::v8i16: {
    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();
    // NOTE(review): Chain is taken from operand 0, which is the same node as
    // rA below (a multiplicand, not a chain value) -- confirm this is the
    // intended chain for the register copies.
    SDOperand Chain = Op.getOperand(0);
    SDOperand rA = Op.getOperand(0);
    SDOperand rB = Op.getOperand(1);
    unsigned FSMBIreg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
    unsigned HiProdReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);

    // Park the FSMBI(0xcccc) select mask in a virtual register.
    SDOperand FSMBOp =
      DAG.getCopyToReg(Chain, FSMBIreg,
		       DAG.getNode(SPUISD::FSMBI, MVT::v8i16,
				   DAG.getConstant(0xcccc, MVT::i32)));

    // The MPYHH product is copied to its own register, chained after the
    // mask copy.
    SDOperand HHProd =
      DAG.getCopyToReg(FSMBOp, HiProdReg,
		       DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB));

    SDOperand HHProd_v4i32 =
      DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
		  DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32));

    // SELB between MPY(rA, rB) and the MPYHH product shifted left by 16
    // within each 32-bit word, using the FSMBI mask read back from FSMBIreg.
    return DAG.getNode(SPUISD::SELB, MVT::v8i16,
		       DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB),
		       DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
				   DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32,
					       HHProd_v4i32,
					       DAG.getConstant(16, MVT::i16))),
		       DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32));
  }

  // This M00sE is N@stI! (apologies to Monty Python)
  //
  // SPU doesn't know how to do any 8-bit multiplication, so the solution
  // is to break it all apart, sign extend, and reassemble the various
  // intermediate products.
  case MVT::v16i8: {
    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();
    // NOTE(review): as in the v8i16 case, Chain is operand 0 (the same node
    // as rA) -- confirm.
    SDOperand Chain = Op.getOperand(0);
    SDOperand rA = Op.getOperand(0);
    SDOperand rB = Op.getOperand(1);
    SDOperand c8 = DAG.getConstant(8, MVT::i8);
    SDOperand c16 = DAG.getConstant(16, MVT::i8);

    unsigned FSMBreg_2222 = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
    unsigned LoProd_reg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
    unsigned HiProd_reg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);

    // MPY of both inputs reinterpreted as v8i16.
    SDOperand LLProd =
      DAG.getNode(SPUISD::MPY, MVT::v8i16,
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA),
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB));

    // Arithmetic right shift by 8 of each input, typed as v8i16.
    SDOperand rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8);

    SDOperand rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8);

    // Product of the shifted inputs, shifted left by 8 again.
    SDOperand LHProd =
      DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16,
		  DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8);

    // The FSMBI(0x2222) select mask is copied through a virtual register and
    // reused by both SELB nodes below.
    SDOperand FSMBdef_2222 =
      DAG.getCopyToReg(Chain, FSMBreg_2222,
		       DAG.getNode(SPUISD::FSMBI, MVT::v8i16,
				   DAG.getConstant(0x2222, MVT::i32)));

    SDOperand FSMBuse_2222 =
      DAG.getCopyFromReg(FSMBdef_2222, FSMBreg_2222, MVT::v4i32);

    // SELB of LLProd and LHProd, parked in LoProd_reg.
    SDOperand LoProd_1 =
      DAG.getCopyToReg(Chain, LoProd_reg,
		       DAG.getNode(SPUISD::SELB, MVT::v8i16, LLProd, LHProd,
				   FSMBuse_2222));

    SDOperand LoProdMask = DAG.getConstant(0xffff, MVT::i32);

    // Keep only the low 16 bits of each 32-bit word.
    SDOperand LoProd =
      DAG.getNode(ISD::AND, MVT::v4i32,
		  DAG.getCopyFromReg(LoProd_1, LoProd_reg, MVT::v4i32),
		  DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
			      LoProdMask, LoProdMask,
			      LoProdMask, LoProdMask));

    // Arithmetic right shift by 16 of each input viewed as v4i32.
    SDOperand rAH =
      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16);

    SDOperand rBH =
      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16);

    // MPY of the shifted inputs.
    SDOperand HLProd =
      DAG.getNode(SPUISD::MPY, MVT::v8i16,
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH),
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH));

    // MPY of the same inputs shifted right by a further 8 bits.
    SDOperand HHProd_1 =
      DAG.getNode(SPUISD::MPY, MVT::v8i16,
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
			      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, rAH, c8)),
		  DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
			      DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, rBH, c8)));

    // SELB of HLProd and (HHProd_1 << 8), parked in HiProd_reg.
    SDOperand HHProd =
      DAG.getCopyToReg(Chain, HiProd_reg,
		       DAG.getNode(SPUISD::SELB, MVT::v8i16,
				   HLProd,
				   DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8),
				   FSMBuse_2222));

    // Shift that result left by 16 within each 32-bit word.
    SDOperand HiProd =
      DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32,
		  DAG.getCopyFromReg(HHProd, HiProd_reg, MVT::v4i32), c16);

    // OR the two halves together and view the result as v16i8.
    return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8,
		       DAG.getNode(ISD::OR, MVT::v4i32,
				   LoProd, HiProd));
  }

  // Any other vector type is reported and treated as fatal.
  default:
    cerr << "CellSPU: Unknown vector multiplication, got "
         << MVT::getValueTypeString(Op.getValueType())
	 << "\n";
    abort();
    /*NOTREACHED*/
  }

  return SDOperand();
}

/// LowerFDIVf32 - Expand a floating point divide A / B into a reciprocal
/// estimate of B (FPRecipEst refined by FPInterp), a first quotient
/// Q = A * rcp, and one correction step: Q + rcp * (A - B * Q).
/// The MVT::f32 result type uses R32FP temporaries; every other result type
/// uses vector register temporaries.
static SDOperand LowerFDIVf32(SDOperand Op, SelectionDAG &DAG) {
  const unsigned VT = Op.getValueType();
  SDOperand Dividend = Op.getOperand(0);
  SDOperand Divisor = Op.getOperand(1);

  SSARegMap *RegMap = DAG.getMachineFunction().getSSARegMap();

  // Temporaries: RcplReg holds the reciprocal, QuotReg the first quotient.
  unsigned RcplReg, QuotReg;
  if (VT != MVT::f32) {
    RcplReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
    QuotReg = RegMap->createVirtualRegister(&SPU::VECREGRegClass);
  } else {
    RcplReg = RegMap->createVirtualRegister(&SPU::R32FPRegClass);
    QuotReg = RegMap->createVirtualRegister(&SPU::R32FPRegClass);
  }

  // TODO: make sure we're feeding FPInterp the right arguments
  // Right now: fi B, frest(B)
  SDOperand Estimate = DAG.getNode(SPUISD::FPRecipEst, VT, Divisor);
  SDOperand Interpolated =
    DAG.getNode(SPUISD::FPInterp, VT, Divisor, Estimate);
  SDOperand RcplDef =
    DAG.getCopyToReg(DAG.getEntryNode(), RcplReg, Interpolated);

  // First quotient A * rcp, stored in its own temporary and chained after
  // the reciprocal definition.
  SDOperand FirstQuot =
    DAG.getNode(ISD::FMUL, VT, Dividend,
                DAG.getCopyFromReg(RcplDef, RcplReg, VT));
  SDOperand QuotDef = DAG.getCopyToReg(RcplDef, QuotReg, FirstQuot);
  // What's the Chain variable do? It's magic!
  // TODO: set Chain = Op(0).getEntryNode()

  // Correction term: rcp * (A - B * Q)
  SDOperand DivisorTimesQuot =
    DAG.getNode(ISD::FMUL, VT, Divisor,
                DAG.getCopyFromReg(QuotDef, QuotReg, VT));
  SDOperand Remainder =
    DAG.getNode(ISD::FSUB, VT, Dividend, DivisorTimesQuot);
  SDOperand Correction =
    DAG.getNode(ISD::FMUL, VT,
                DAG.getCopyFromReg(QuotDef, RcplReg, VT), Remainder);

  return DAG.getNode(ISD::FADD, VT,
                     DAG.getCopyFromReg(QuotDef, QuotReg, VT), Correction);
}

// Expands double-precision FDIV
// Expects two doubles as inputs X and Y, does a floating point
// reciprocal estimate, and three iterations of Newton-Raphson
// to increase accuracy.
//static SDOperand LowerFDIVf64(SDOperand Op, SelectionDAG &DAG) {
//  MachineFunction &MF = DAG.getMachineFunction();
//  SSARegMap *RegMap = MF.getSSARegMap();
//
//  SDOperand X = Op.getOperand(0);
//  SDOperand Y = Op.getOperand(1);
//}

/// LowerEXTRACT_VECTOR_ELT - Lower extraction of a constant-indexed vector
/// element. Element 0 of i32/i64 vectors is read directly with EXTRACT_ELT0.
/// Every other case first uses SHUFB to move the requested element's bytes
/// into the value type's preferred slot and then applies EXTRACT_ELT0.
///
/// The element index operand must be a constant. f32 and f64 share the
/// preferred slot bytes of i32 and i64 respectively.
static SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  unsigned VT = Op.getValueType();
  SDOperand N = Op.getOperand(0);
  SDOperand Elt = Op.getOperand(1);
  SDOperand ShufMask[16];
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);

  assert(C != 0 && "LowerEXTRACT_VECTOR_ELT expecting constant SDNode");

  int EltNo = (int) C->getValue();

  // sanity checks: the index must lie inside a 16-byte vector
  if (VT == MVT::i8 && EltNo >= 16)
    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
  else if (VT == MVT::i16 && EltNo >= 8)
    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
  else if ((VT == MVT::i32 || VT == MVT::f32) && EltNo >= 4)
    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: 32-bit extraction slot > 3");
  else if ((VT == MVT::i64 || VT == MVT::f64) && EltNo >= 2)
    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: 64-bit extraction slot > 1");

  if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
    // i32 and i64: Element 0 is the preferred slot
    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, N);
  }

  // Need to generate shuffle mask and extract:
  int prefslot_begin = -1, prefslot_end = -1;
  int elt_byte = EltNo * MVT::getSizeInBits(VT) / 8;

  // First and last byte of the preferred slot for each scalar type.
  switch (VT) {
  case MVT::i8: {
    prefslot_begin = prefslot_end = 3;
    break;
  }
  case MVT::i16: {
    prefslot_begin = 2; prefslot_end = 3;
    break;
  }
  case MVT::i32:
  case MVT::f32: {
    prefslot_begin = 0; prefslot_end = 3;
    break;
  }
  case MVT::i64:
  case MVT::f64: {
    prefslot_begin = 0; prefslot_end = 7;
    break;
  }
  default:
    // Without a known preferred slot no valid mask can be built; never read
    // the slot bounds uninitialized.
    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: unhandled element type");
    return SDOperand();
  }

  for (int i = 0; i < 16; ++i) {
    // zero fill upper part of preferred slot, don't care about the
    // other slots:
    unsigned int mask_val;

    if (i <= prefslot_end) {
      mask_val =
	((i < prefslot_begin)
	 ? 0x80
	 : elt_byte + (i - prefslot_begin));

      // The mask is a v16i8 BUILD_VECTOR, so each entry must be an i8
      // constant.
      ShufMask[i] = DAG.getConstant(mask_val, MVT::i8);
    } else
      // Beyond the preferred slot the pattern simply repeats.
      ShufMask[i] = ShufMask[i % (prefslot_end + 1)];
  }

  SDOperand ShufMaskVec =
    DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
		&ShufMask[0],
		sizeof(ShufMask) / sizeof(ShufMask[0]));

  return DAG.getNode(SPUISD::EXTRACT_ELT0, VT,
		     DAG.getNode(SPUISD::SHUFB, N.getValueType(),
				 N, N, ShufMaskVec));

}

/// LowerINSERT_VECTOR_ELT - Lower insertion of a scalar into a vector at a
/// constant index. The scalar is converted with SCALAR_TO_VECTOR and merged
/// into the original vector by SHUFB, using an INSERT_MASK computed from the
/// address (R2 + index).
///
/// The index operand (operand 2) must be a constant.
static SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  SDOperand VecOp = Op.getOperand(0);
  SDOperand ValOp = Op.getOperand(1);
  SDOperand IdxOp = Op.getOperand(2);
  MVT::ValueType VT = Op.getValueType();

  // dyn_cast (rather than cast) so that the assertion below can actually
  // fire with its own message when the index is not a constant.
  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(IdxOp);
  assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");

  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Use $2 because it's always 16-byte aligned and it's available:
  SDOperand PtrBase = DAG.getRegister(SPU::R2, PtrVT);

  // Address fed to INSERT_MASK: base register plus the constant index.
  SDOperand InsertAddr =
    DAG.getNode(ISD::ADD, PtrVT, PtrBase,
                DAG.getConstant(CN->getValue(), PtrVT));

  SDOperand result =
    DAG.getNode(SPUISD::SHUFB, VT,
                DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp),
                VecOp,
                DAG.getNode(SPUISD::INSERT_MASK, VT, InsertAddr));

  return result;
}

/// LowerI8Math - Lower an i8 arithmetic, shift or rotate operation by
/// widening the operands to i16, performing the operation at i16, and
/// truncating the result back to i8.
///
/// \param Op  the i8 operation being lowered (asserted to produce MVT::i8)
/// \param DAG the selection DAG in which new nodes are created
/// \param Opc the ISD opcode to perform: SUB, ROTR, ROTL, SRL, SHL, SRA or MUL
///
/// Constant operands are widened directly instead of through an extend node.
/// Where the non-constant path sign extends (SUB, MUL and the shifted value
/// of SRA), constants are sign extended as well, so that for example SRA of
/// a negative i8 constant shifts in copies of the sign bit.
static SDOperand LowerI8Math(SDOperand Op, SelectionDAG &DAG, unsigned Opc) {
  SDOperand N0 = Op.getOperand(0);      // Everything has at least one operand

  assert(Op.getValueType() == MVT::i8);
  switch (Opc) {
  default:
    assert(0 && "Unhandled i8 math operator");
    /*NOTREACHED*/
    break;
  case ISD::SUB: {
    // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
    // the result:
    SDOperand N1 = Op.getOperand(1);
    N0 = (N0.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
          : DAG.getConstant(cast<ConstantSDNode>(N0)->getSignExtended(),
                            MVT::i16));
    N1 = (N1.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1)
          : DAG.getConstant(cast<ConstantSDNode>(N1)->getSignExtended(),
                            MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                       DAG.getNode(Opc, MVT::i16, N0, N1));
  }
  case ISD::ROTR:
  case ISD::ROTL: {
    SDOperand N1 = Op.getOperand(1);
    unsigned N1Opc;
    N0 = (N0.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0)
          : DAG.getConstant(cast<ConstantSDNode>(N0)->getValue(), MVT::i16));
    // The rotate amount is brought to i16 by whichever conversion matches
    // its current width.
    N1Opc = (N1.getValueType() < MVT::i16 ? ISD::ZERO_EXTEND : ISD::TRUNCATE);
    N1 = (N1.getOpcode() != ISD::Constant
          ? DAG.getNode(N1Opc, MVT::i16, N1)
          : DAG.getConstant(cast<ConstantSDNode>(N1)->getValue(), MVT::i16));
    // Replicate the byte into both halves of the i16 so that a 16-bit rotate
    // leaves the 8-bit rotate result in the low byte.
    SDOperand ExpandArg =
      DAG.getNode(ISD::OR, MVT::i16, N0,
                  DAG.getNode(ISD::SHL, MVT::i16,
                              N0, DAG.getConstant(8, MVT::i16)));
    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                       DAG.getNode(Opc, MVT::i16, ExpandArg, N1));
  }
  case ISD::SRL:
  case ISD::SHL: {
    // Logical shifts: zero extend the shifted value.
    SDOperand N1 = Op.getOperand(1);
    unsigned N1Opc;
    N0 = (N0.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0)
          : DAG.getConstant(cast<ConstantSDNode>(N0)->getValue(), MVT::i16));
    N1Opc = (N1.getValueType() < MVT::i16 ? ISD::ZERO_EXTEND : ISD::TRUNCATE);
    N1 = (N1.getOpcode() != ISD::Constant
          ? DAG.getNode(N1Opc, MVT::i16, N1)
          : DAG.getConstant(cast<ConstantSDNode>(N1)->getValue(), MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                       DAG.getNode(Opc, MVT::i16, N0, N1));
  }
  case ISD::SRA: {
    // Arithmetic shift: the shifted value must be sign extended, including
    // when it is a constant, or the wrong bits are shifted in.
    SDOperand N1 = Op.getOperand(1);
    unsigned N1Opc;
    N0 = (N0.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
          : DAG.getConstant(cast<ConstantSDNode>(N0)->getSignExtended(),
                            MVT::i16));
    N1Opc = (N1.getValueType() < MVT::i16 ? ISD::SIGN_EXTEND : ISD::TRUNCATE);
    N1 = (N1.getOpcode() != ISD::Constant
          ? DAG.getNode(N1Opc, MVT::i16, N1)
          : DAG.getConstant(cast<ConstantSDNode>(N1)->getValue(), MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                       DAG.getNode(Opc, MVT::i16, N0, N1));
  }
  case ISD::MUL: {
    // 8-bit multiply: sign extend both operands, multiply at i16, truncate.
    SDOperand N1 = Op.getOperand(1);
    unsigned N1Opc;
    N0 = (N0.getOpcode() != ISD::Constant
          ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
          : DAG.getConstant(cast<ConstantSDNode>(N0)->getSignExtended(),
                            MVT::i16));
    N1Opc = (N1.getValueType() < MVT::i16 ? ISD::SIGN_EXTEND : ISD::TRUNCATE);
    N1 = (N1.getOpcode() != ISD::Constant
          ? DAG.getNode(N1Opc, MVT::i16, N1)
          : DAG.getConstant(cast<ConstantSDNode>(N1)->getSignExtended(),
                            MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, MVT::i8,
                       DAG.getNode(Opc, MVT::i16, N0, N1));
  }
  }

  return SDOperand();
}

//! Lower byte immediate operations for v16i8 vectors:
static SDOperand
LowerByteImmed(SDOperand Op, SelectionDAG &DAG) {
  const MVT::ValueType VT = Op.getValueType();

  // Start by assuming operand 0 is the constant vector and operand 1 the
  // other argument.
  SDOperand Splat = Op.getOperand(0);
  SDOperand Other = Op.getOperand(1);

  if (Splat.Val->getOpcode() == ISD::BIT_CONVERT) {
    // Look through a bit conversion on operand 0.
    Splat = Splat.getOperand(0);
  } else if (Splat.Val->getOpcode() != ISD::BUILD_VECTOR) {
    // Operand 0 is neither: try the operands the other way around, again
    // looking through a bit conversion.
    Splat = Op.getOperand(1);
    Other = Op.getOperand(0);
    if (Splat.Val->getOpcode() == ISD::BIT_CONVERT)
      Splat = Splat.getOperand(0);
  }

  // Only a BUILD_VECTOR can be turned into byte immediates.
  if (Splat.Val->getOpcode() != ISD::BUILD_VECTOR)
    return SDOperand();

  uint64_t VectorBits[2];
  uint64_t UndefBits[2];
  uint64_t SplatBits, SplatUndef;
  int SplatSize;

  if (GetConstantBuildVectorBits(Splat.Val, VectorBits, UndefBits))
    return SDOperand();

  if (!isConstantSplat(VectorBits, UndefBits,
                       MVT::getSizeInBits(MVT::getVectorElementType(VT)),
                       SplatBits, SplatUndef, SplatSize))
    return SDOperand();

  // Turn the BUILD_VECTOR into a set of target constants, each holding the
  // low byte of the splat value:
  SDOperand ByteImm = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
  SDOperand ImmVec[16];
  const size_t NumImm = sizeof(ImmVec) / sizeof(ImmVec[0]);
  for (size_t Idx = 0; Idx != NumImm; ++Idx)
    ImmVec[Idx] = ByteImm;

  return DAG.getNode(Op.Val->getOpcode(), VT, Other,
                     DAG.getNode(ISD::BUILD_VECTOR, VT, ImmVec, NumImm));
}

//! Lower i32 multiplication
/// The i32 product is assembled as MPYH(a, b) + MPYH(b, a) + MPYU(a, b).
/// Any other value type prints a diagnostic and aborts. Opc is not used.
static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG, unsigned VT,
                          unsigned Opc) {
  if (VT != MVT::i32) {
    cerr << "CellSPU: Unknown LowerMUL value type, got "
         << MVT::getValueTypeString(Op.getValueType())
	 << "\n";
    abort();
    /*NOTREACHED*/
  }

  SDOperand LHS = Op.getOperand(0);
  SDOperand RHS = Op.getOperand(1);

  SDOperand HighLR = DAG.getNode(SPUISD::MPYH, MVT::i32, LHS, RHS);
  SDOperand HighRL = DAG.getNode(SPUISD::MPYH, MVT::i32, RHS, LHS);
  SDOperand HighSum = DAG.getNode(ISD::ADD, MVT::i32, HighLR, HighRL);
  SDOperand LowProd = DAG.getNode(SPUISD::MPYU, MVT::i32, LHS, RHS);

  return DAG.getNode(ISD::ADD, MVT::i32, HighSum, LowProd);
}

//! Custom lowering for CTPOP (count population)
/*!
  Custom lowering code that counts the number of ones in the input
  operand. SPU has such an instruction, but it counts the number of
  ones per byte; the per-byte counts then have to be accumulated.
*/
/// LowerCTPOP - Lower a scalar population count for i8, i16 and i32.
///
/// The scalar is promoted into a vector, CNTB is applied to it, and
/// element 0 of the result is extracted. For i16 and i32 the per-byte counts
/// held in that element are then folded into the low byte with shift/add
/// steps and masked. i64 is not handled and yields an empty SDOperand.
static SDOperand LowerCTPOP(SDOperand Op, SelectionDAG &DAG) {
  unsigned VT = Op.getValueType();
  unsigned vecVT = MVT::getVectorType(VT, (128 / MVT::getSizeInBits(VT)));

  switch (VT) {
  case MVT::i8: {
    // A single byte: the CNTB result is already the answer.
    SDOperand N = Op.getOperand(0);
    SDOperand Elt0 = DAG.getConstant(0, MVT::i32);

    SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N);
    SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i8, CNTB, Elt0);
  }

  case MVT::i16: {
    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();

    unsigned CNTB_reg = RegMap->createVirtualRegister(&SPU::R16CRegClass);

    SDOperand N = Op.getOperand(0);
    SDOperand Elt0 = DAG.getConstant(0, MVT::i16);
    // Two bytes of up to 8 set bits each sum to at most 16 (0x10), which
    // needs five bits; a 0x0f mask would turn a count of 16 into 0.
    SDOperand Mask0 = DAG.getConstant(0x1f, MVT::i16);
    SDOperand Shift1 = DAG.getConstant(8, MVT::i16);

    SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N);
    SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDOperand CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, CNTB, Elt0);

    SDOperand CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result);

    SDOperand Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i16);

    // ((x >> 8) + x) leaves the sum of both byte counts in the low byte.
    return DAG.getNode(ISD::AND, MVT::i16,
		       DAG.getNode(ISD::ADD, MVT::i16,
				   DAG.getNode(ISD::SRL, MVT::i16,
					       Tmp1, Shift1),
				   Tmp1),
		       Mask0);
  }

  case MVT::i32: {
    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();

    unsigned CNTB_reg = RegMap->createVirtualRegister(&SPU::R32CRegClass);
    unsigned SUM1_reg = RegMap->createVirtualRegister(&SPU::R32CRegClass);

    SDOperand N = Op.getOperand(0);
    SDOperand Elt0 = DAG.getConstant(0, MVT::i32);
    // Four byte counts sum to at most 32, which fits in the low byte.
    SDOperand Mask0 = DAG.getConstant(0xff, MVT::i32);
    SDOperand Shift1 = DAG.getConstant(16, MVT::i32);
    SDOperand Shift2 = DAG.getConstant(8, MVT::i32);

    SDOperand Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N);
    SDOperand CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDOperand CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, CNTB, Elt0);

    SDOperand CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result);

    // First fold: add the upper halfword's counts onto the lower halfword's.
    SDOperand Comp1 =
      DAG.getNode(ISD::SRL, MVT::i32,
		  DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32), Shift1);

    SDOperand Sum1 =
      DAG.getNode(ISD::ADD, MVT::i32,
		  Comp1, DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32));

    SDOperand Sum1_rescopy =
      DAG.getCopyToReg(CNTB_result, SUM1_reg, Sum1);

    // Second fold: add the remaining two byte sums together.
    SDOperand Comp2 =
      DAG.getNode(ISD::SRL, MVT::i32,
		  DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32),
		  Shift2);
    SDOperand Sum2 =
      DAG.getNode(ISD::ADD, MVT::i32, Comp2,
		  DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32));

    return DAG.getNode(ISD::AND, MVT::i32, Sum2, Mask0);
  }

  case MVT::i64:
    // Not implemented: falls out to the empty result below.
    break;
  }

  return SDOperand();
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDOperand
SPUTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG)
{
  // Dispatch on the node's opcode to the matching static lowering routine.
  // An opcode with no entry here is a fatal error: the node is dumped and
  // the process aborts.
  switch (Op.getOpcode()) {
  default: {
    cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
    cerr << "Op.getOpcode() = " << Op.getOpcode() << "\n";
    cerr << "*Op.Val:\n";
    Op.Val->dump();
    abort();
  }
  case ISD::LOAD:
  case ISD::SEXTLOAD:
  case ISD::ZEXTLOAD:
    return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::STORE:
    return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::Constant:
    return LowerConstant(Op, DAG);
  case ISD::ConstantFP:
    return LowerConstantFP(Op, DAG);
  case ISD::FORMAL_ARGUMENTS:
    return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
  case ISD::CALL:
    return LowerCALL(Op, DAG);
  case ISD::RET:
    return LowerRET(Op, DAG, getTargetMachine());

  // i8 math ops:
  case ISD::SUB:
  case ISD::ROTR:
  case ISD::ROTL:
  case ISD::SRL:
  case ISD::SHL:
  case ISD::SRA:
    return LowerI8Math(Op, DAG, Op.getOpcode());

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);

  // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return LowerByteImmed(Op, DAG);

  // Vector and i8 multiply:
  case ISD::MUL:
    if (MVT::isVector(Op.getValueType()))
      return LowerVectorMUL(Op, DAG);
    else if (Op.getValueType() == MVT::i8)
      return LowerI8Math(Op, DAG, Op.getOpcode());
    else
      return LowerMUL(Op, DAG, Op.getValueType(), Op.getOpcode());

  case ISD::FDIV:
    if (Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::v4f32)
      return LowerFDIVf32(Op, DAG);
//    else if (Op.getValueType() == MVT::f64)
//      return LowerFDIVf64(Op, DAG);
    assert(0 && "Calling FDIV on unsupported MVT");
    // With asserts compiled out, do not fall through into the CTPOP case:
    // leave the switch and return the empty operand instead.
    break;

  case ISD::CTPOP:
    return LowerCTPOP(Op, DAG);
  }

  return SDOperand();
}

//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//

MachineBasicBlock *
SPUTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
                                           MachineBasicBlock *BB)
{
  // No instruction is expanded here: MI is not examined and the incoming
  // block is handed back as-is.
  MachineBasicBlock *Result = BB;
  return Result;
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

SDOperand
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
  // Target-specific combines. Returns the replacement value when a
  // simplification applies, or an empty SDOperand when nothing changes.
#if 0
  TargetMachine &TM = getTargetMachine();
  SelectionDAG &DAG = DCI.DAG;
#endif
  SDOperand N0 = N->getOperand(0);	// everything has at least one operand

  switch (N->getOpcode()) {
  default: break;

  // Look for obvious optimizations for shift left:
  // a) Replace 0 << V with 0
  // b) Replace V << 0 with V
  //
  // N.B: llvm will generate an undef node if the shift amount is greater than
  // 15 (e.g.: V << 16), which will naturally trigger an assert.
  case SPU::SHLIr32:
  case SPU::SHLHIr16:
  case SPU::SHLQBIIvec:
  case SPU::ROTHIr16:
  case SPU::ROTHIr16_i32:
  case SPU::ROTIr32:
  case SPU::ROTIr32_i16:
  case SPU::ROTQBYIvec:
  case SPU::ROTQBYBIvec:
  case SPU::ROTQBIIvec:
  case SPU::ROTHMIr16:
  case SPU::ROTMIr32:
  case SPU::ROTQMBYIvec: {
    // 0 << V -> 0: the zero operand itself is the result.
    if (N0.getOpcode() == ISD::Constant
        && cast<ConstantSDNode>(N0)->getValue() == 0)
      return N0;

    // V << 0 -> V: the result is the value being shifted (N0), not the zero
    // shift amount (N1).
    SDOperand N1 = N->getOperand(1);
    if (N1.getOpcode() == ISD::Constant
        && cast<ConstantSDNode>(N1)->getValue() == 0)
      return N0;

    break;
  }
  }

  return SDOperand();
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
SPUTargetLowering::ConstraintType
SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
  // The single letters b, r, f, v and y name register classes; everything
  // else is left to the generic implementation.
  if (ConstraintLetter.size() == 1) {
    const char Letter = ConstraintLetter[0];
    if (Letter == 'b' || Letter == 'r' || Letter == 'f'
        || Letter == 'v' || Letter == 'y')
      return C_RegisterClass;
  }

  return TargetLowering::getConstraintType(ConstraintLetter);
}

std::pair<unsigned, const TargetRegisterClass*>
SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT::ValueType VT) const
{
  // Map a single-letter constraint (plus the value type) to a register
  // class. The register number in the returned pair is always 0 here.
  // Anything not matched below is deferred to the generic implementation.
  if (Constraint.size() == 1) {
    const char Letter = Constraint[0];

    if (Letter == 'b' || Letter == 'r') {
      // i64 gets the R64C class; every other type gets R32C.
      if (VT == MVT::i64)
        return std::make_pair(0U, SPU::R64CRegisterClass);
      return std::make_pair(0U, SPU::R32CRegisterClass);
    }

    if (Letter == 'f') {
      // Only f32 and f64 have a floating point class; other types fall
      // through to the generic handler.
      if (VT == MVT::f32)
        return std::make_pair(0U, SPU::R32FPRegisterClass);
      if (VT == MVT::f64)
        return std::make_pair(0U, SPU::R64FPRegisterClass);
    } else if (Letter == 'v') {
      return std::make_pair(0U, SPU::GPRCRegisterClass);
    }
  }

  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

void
SPUTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
						  uint64_t Mask,
						  uint64_t &KnownZero,
						  uint64_t &KnownOne,
						  const SelectionDAG &DAG,
						  unsigned Depth ) const {
  // No bits are reported as known for any target node: both outputs are
  // cleared and the remaining arguments are not consulted.
  KnownOne = 0;
  KnownZero = 0;
}

// LowerAsmOperandForConstraint
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDOperand Op,
                                                char ConstraintLetter,
                                                std::vector<SDOperand> &Ops,
                                                SelectionDAG &DAG) {
  // For the time being every constraint is forwarded unchanged to the base
  // class handler.
  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter,
                                               Ops, DAG);
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode.
bool SPUTargetLowering::isLegalAddressImmediate(int64_t V, const Type *Ty) const {
  // SPU's addresses are 256K (1 << 18). The offset is accepted when it lies
  // strictly between -(1 << 18) and (1 << 18) - 1; Ty is not consulted.
  const int64_t Limit = 1 << 18;
  if (V <= -Limit)
    return false;
  return V < Limit - 1;
}

bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
  // A global value is never accepted as an address immediate; GV is not
  // examined.
  const bool IsLegal = false;
  return IsLegal;
}