//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
using namespace llvm;
namespace {
/// Diagnostic information for unimplemented or unsupported feature reporting.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
const Twine &Description;
const Function &Fn;
static int KindID;
static int getKindID() {
if (KindID == 0)
KindID = llvm::getNextAvailablePluginDiagnosticKind();
return KindID;
}
public:
DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
DiagnosticSeverity Severity = DS_Error)
: DiagnosticInfo(getKindID(), Severity),
Description(Desc),
Fn(Fn) { }
const Function &getFunction() const { return Fn; }
const Twine &getDescription() const { return Description; }
void print(DiagnosticPrinter &DP) const override {
DP << "unsupported " << getDescription() << " in " << Fn.getName();
}
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == getKindID();
}
};
int DiagnosticInfoUnsupported::KindID = 0;
}
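// Custom handler referenced by the generated calling convention
// (AMDGPUGenCallingConv.inc): allocate a stack slot for the argument and
// record the assignment.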
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
ArgFlags.getOrigAlign());
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
#include "AMDGPUGenCallingConv.inc"
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
TargetLowering(TM, new TargetLoweringObjectFileELF()) {
Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
// Initialize target lowering borrowed from AMDIL
InitAMDILLowering();
// We need to custom lower some of the intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FEXP2, MVT::f32, Legal);
setOperationAction(ISD::FPOW, MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Legal);
setOperationAction(ISD::FABS, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
// The hardware supports ROTR, but not ROTL
setOperationAction(ISD::ROTL, MVT::i32, Expand);
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
setOperationAction(ISD::STORE, MVT::f64, Promote);
AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
// Custom lowering of vector stores is required for local address space
// stores.
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
// XXX: Native v2i32 local address space stores are possible, but not
// currently implemented.
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
// XXX: This can be changed to Custom, once ExpandVectorStores can
// handle 64-bit stores.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
setOperationAction(ISD::LOAD, MVT::f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::SUB, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
static const MVT::SimpleValueType IntTypes[] = {
MVT::v2i32, MVT::v4i32
};
for (MVT VT : IntTypes) {
// Expand the following operations for the current type by default
setOperationAction(ISD::ADD, VT, Expand);
setOperationAction(ISD::AND, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::MUL, VT, Expand);
setOperationAction(ISD::OR, VT, Expand);
setOperationAction(ISD::SHL, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SRL, VT, Expand);
setOperationAction(ISD::SRA, VT, Expand);
setOperationAction(ISD::SUB, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::XOR, VT, Expand);
}
static const MVT::SimpleValueType FloatTypes[] = {
MVT::v2f32, MVT::v4f32
};
for (MVT VT : FloatTypes) {
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FMUL, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
}
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT_CC);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
MVT AMDGPUTargetLowering::getVectorIdxTy() const {
return MVT::i32;
}
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
return true;
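// Don't turn a load of dword-or-wider scalars into a load of sub-dword
// scalars; any other bitcast of the load is considered beneficial.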
unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
return ((LScalarSize <= CastScalarSize) ||
(CastScalarSize >= 32) ||
(LScalarSize < 32));
}
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
return VT == MVT::f32;
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
return VT == MVT::f32;
}
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
// Truncate is just accessing a subregister.
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}
bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
// Truncate is just accessing a subregister.
return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
(Dest->getPrimitiveSizeInBits() % 32 == 0);
}
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
const DataLayout *DL = getDataLayout();
unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
return SrcSize == 32 && DestSize == 64;
}
bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
// Any register load of a 64-bit value really requires 2 32-bit moves. For all
// practical purposes, the extra mov 0 to load a 64-bit is free. As used,
// this will enable reducing 64-bit operations to 32-bit, which is always
// good.
return Src == MVT::i32 && Dest == MVT::i64;
}
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, just pairs of 32-bit ones, and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
// in a single 32-bit register should always be helpful. As currently used,
// this is much less general than the name suggests, and is only used in
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
// not profitable, and may actually be harmful.
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const {
State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}
SDValue AMDGPUTargetLowering::LowerReturn(
SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
SDLoc DL, SelectionDAG &DAG) const {
return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SDValue Callee = CLI.Callee;
SelectionDAG &DAG = CLI.DAG;
const Function &Fn = *DAG.getMachineFunction().getFunction();
StringRef FuncName("<unknown>");
if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
FuncName = G->getSymbol();
else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
FuncName = G->getGlobal()->getName();
DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
DAG.getContext()->diagnose(NoCalls);
return SDValue();
}
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
const {
switch (Op.getOpcode()) {
default:
Op.getNode()->dump();
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
// AMDIL DAG lowering
case ISD::SDIV: return LowerSDIV(Op, DAG);
case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
// AMDGPU DAG lowering
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
}
return Op;
}
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
switch (N->getOpcode()) {
case ISD::SIGN_EXTEND_INREG:
// Different parts of legalization seem to interpret which type of
// sign_extend_inreg is the one to check for custom lowering. The extended
// from type is what really matters, but some places check for custom
// lowering of the result type. This results in trying to use
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
case ISD::UDIV: {
SDValue Op = SDValue(N, 0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
N->getOperand(0), N->getOperand(1));
Results.push_back(UDIVREM);
break;
}
case ISD::UREM: {
SDValue Op = SDValue(N, 0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
N->getOperand(0), N->getOperand(1));
Results.push_back(UDIVREM.getValue(1));
break;
}
case ISD::UDIVREM: {
SDValue Op = SDValue(N, 0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
SDValue one = DAG.getConstant(1, HalfVT);
SDValue zero = DAG.getConstant(0, HalfVT);
// Hi/Lo split
SDValue LHS = N->getOperand(0);
SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
SDValue RHS = N->getOperand(1);
SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
// Get Speculative values
SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Hi = zero;
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
const unsigned halfBitWidth = HalfVT.getSizeInBits();
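// Restoring shift/subtract long division over the low half: each iteration
// shifts the next bit of LHS_Lo into the running remainder, and if the
// remainder is at least RHS, sets the corresponding quotient bit and
// subtracts RHS from the remainder.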
for (unsigned i = 0; i < halfBitWidth; ++i) {
SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
// Get Value of high bit
SDValue HBit;
if (halfBitWidth == 32 && Subtarget->hasBFE()) {
HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
} else {
HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
}
SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
DAG.getConstant(halfBitWidth - 1, HalfVT));
REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
// Update REM
SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
}
SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
Results.push_back(DIV);
Results.push_back(REM);
break;
}
default:
return;
}
}
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const GlobalValue *GV,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
SDLoc DL(InitPtr);
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
EVT VT = EVT::getEVT(CI->getType());
PointerType *PtrTy = PointerType::get(CI->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
TD->getPrefTypeAlignment(CI->getType()));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
TD->getPrefTypeAlignment(CFP->getType()));
}
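// Aggregate initializers: emit a store for each element at its byte offset
// and merge the resulting chains with a TokenFactor.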
if (Init->getType()->isAggregateType()) {
EVT PtrVT = InitPtr.getValueType();
unsigned NumElements = Init->getType()->getArrayNumElements();
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0; i < NumElements; ++i) {
SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
Init->getType()->getArrayElementType()), PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
GV, Ptr, Chain, DAG));
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
Init->dump();
llvm_unreachable("Unhandled constant initializer");
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
default: llvm_unreachable("Global Address lowering not implemented for this "
"address space");
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
Offset = MFI->LDSSize;
MFI->LocalMemoryObjects[GV] = Offset;
// XXX: Account for alignment?
MFI->LDSSize += Size;
} else {
Offset = MFI->LocalMemoryObjects[GV];
}
return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
}
case AMDGPUAS::CONSTANT_ADDRESS: {
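// Constant address space globals are emulated by copying the initializer
// into a private (scratch) stack object and re-chaining any existing loads
// so they are ordered after the initializing stores.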
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
Type *EltType = GV->getType()->getElementType();
unsigned Size = TD->getTypeAllocSize(EltType);
unsigned Alignment = TD->getPrefTypeAlignment(EltType);
const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
const Constant *Init = Var->getInitializer();
int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
SDValue InitPtr = DAG.getFrameIndex(FI,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
SmallVector<SDNode*, 8> WorkList;
for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
E = DAG.getEntryNode()->use_end(); I != E; ++I) {
if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
continue;
WorkList.push_back(*I);
}
SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
E = WorkList.end(); I != E; ++I) {
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
Ops.push_back((*I)->getOperand(i));
}
DAG.UpdateNodeOperands(*I, Ops);
}
return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
}
}
}
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
DAG.ExtractVectorElements(A, Args);
DAG.ExtractVectorElements(B, Args);
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
assert(FIN);
unsigned FrameIndex = FIN->getIndex();
unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
Op.getValueType());
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc DL(Op);
EVT VT = Op.getValueType();
switch (IntrinsicID) {
default: return Op;
case AMDGPUIntrinsic::AMDIL_abs:
return LowerIntrinsicIABS(Op, DAG);
case AMDGPUIntrinsic::AMDIL_exp:
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_lrp:
return LowerIntrinsicLRP(Op, DAG);
case AMDGPUIntrinsic::AMDIL_fraction:
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDIL_max:
return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imax:
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_umax:
return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDIL_min:
return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imin:
return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_umin:
return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_umul24:
return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imul24:
return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_bfe_u32:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_bfi:
return DAG.getNode(AMDGPUISD::BFI, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_bfm:
return DAG.getNode(AMDGPUISD::BFM, DL, VT,
Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDIL_round_nearest:
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
}
}
/// IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
Op.getOperand(1));
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
}
/// Linear Interpolation
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
DAG.getConstantFP(1.0f, MVT::f32),
Op.getOperand(1));
SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
Op.getOperand(3));
return DAG.getNode(ISD::FADD, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
OneSubAC);
}
/// \brief Generate Min/Max node
SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
SelectionDAG &DAG) const {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue True = N->getOperand(2);
SDValue False = N->getOperand(3);
SDValue CC = N->getOperand(4);
if (VT != MVT::f32 ||
!((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
return SDValue();
}
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
switch (CCOpcode) {
case ISD::SETOEQ:
case ISD::SETONE:
case ISD::SETUNE:
case ISD::SETNE:
case ISD::SETUEQ:
case ISD::SETEQ:
case ISD::SETFALSE:
case ISD::SETFALSE2:
case ISD::SETTRUE:
case ISD::SETTRUE2:
case ISD::SETUO:
case ISD::SETO:
llvm_unreachable("Operation should already be optimised!");
case ISD::SETULE:
case ISD::SETULT:
case ISD::SETOLE:
case ISD::SETOLT:
case ISD::SETLE:
case ISD::SETLT: {
unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
case ISD::SETGT:
case ISD::SETGE:
case ISD::SETUGE:
case ISD::SETOGE:
case ISD::SETUGT:
case ISD::SETOGT: {
unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
case ISD::SETCC_INVALID:
llvm_unreachable("Invalid setcc condcode!");
}
return SDValue();
}
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
EVT EltVT = Op.getValueType().getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
SmallVector<SDValue, 8> Loads;
SDLoc SL(Op);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
Load->getChain(), Ptr,
MachinePointerInfo(Load->getMemOperand()->getValue()),
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
Load->getAlignment()));
}
return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT MemVT = Store->getMemoryVT();
unsigned MemBits = MemVT.getSizeInBits();
// Byte stores are really expensive, so if possible, try to pack a 32-bit
// vector truncating store into an i32 store.
// XXX: We could also optimize other vector bitwidths.
if (!MemVT.isVector() || MemBits > 32) {
return SDValue();
}
SDLoc DL(Op);
SDValue Value = Store->getValue();
EVT VT = Value.getValueType();
EVT ElemVT = VT.getVectorElementType();
SDValue Ptr = Store->getBasePtr();
EVT MemEltVT = MemVT.getVectorElementType();
unsigned MemEltBits = MemEltVT.getSizeInBits();
unsigned MemNumElements = MemVT.getVectorNumElements();
unsigned PackedSize = MemVT.getStoreSizeInBits();
SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
assert(Value.getValueType().getScalarSizeInBits() >= 32);
SDValue PackedValue;
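// Pack the elements into a single i32: truncate each element to i32, mask it
// to the memory element width, shift it into position, and OR it into
// PackedValue.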
for (unsigned i = 0; i < MemNumElements; ++i) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
DAG.getConstant(i, MVT::i32));
Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
if (i == 0) {
PackedValue = Elt;
} else {
PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
}
}
if (PackedSize < 32) {
EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
Store->getMemOperand()->getPointerInfo(),
PackedVT,
Store->isNonTemporal(), Store->isVolatile(),
Store->getAlignment());
}
return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
Store->getMemOperand()->getPointerInfo(),
Store->isVolatile(), Store->isNonTemporal(),
Store->getAlignment());
}
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
EVT EltVT = Store->getValue().getValueType().getVectorElementType();
EVT PtrVT = Store->getBasePtr().getValueType();
unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
SDLoc SL(Op);
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Store->getValue(), DAG.getConstant(i, MVT::i32));
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
Store->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
PtrVT));
Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
MachinePointerInfo(Store->getMemOperand()->getValue()),
MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
Store->getAlignment()));
}
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
EVT VT = Op.getValueType();
EVT MemVT = Load->getMemoryVT();
if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
// We can do the extload to 32-bits, and then need to separately extend to
// 64-bits.
SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
Load->getChain(),
Load->getBasePtr(),
MemVT,
Load->getMemOperand());
return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
}
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
assert(VT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
// First, load into 32 bits, then truncate to 1 bit.
SDValue Chain = Load->getChain();
SDValue BasePtr = Load->getBasePtr();
MachineMemOperand *MMO = Load->getMemOperand();
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
}
// Lower loads of constant address space global variables.
if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
isa<GlobalVariable>(
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, MVT::i32));
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
}
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
return SDValue();
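// Private-address extloads narrower than 32 bits: private memory is dword
// addressed, so load the containing dword (base pointer shifted right by 2),
// shift the loaded value right by 8 * (byte offset within the dword), then
// sign- or zero-extend in register.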
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
Load->getBasePtr(),
DAG.getConstant(0x3, MVT::i32));
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, MVT::i32));
Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
EVT MemEltVT = MemVT.getScalarType();
if (ExtType == ISD::SEXTLOAD) {
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
}
return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
SDValue Chain = Store->getChain();
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}
EVT MemVT = Store->getMemoryVT();
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
MemVT.bitsLT(MVT::i32)) {
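// Sub-dword private stores become a read-modify-write of the containing
// dword: load the dword, clear the destination byte/halfword with an
// inverted mask, shift the zero-extended value into position, OR it in, and
// write the dword back.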
unsigned Mask = 0;
if (Store->getMemoryVT() == MVT::i8) {
Mask = 0xff;
} else if (Store->getMemoryVT() == MVT::i16) {
Mask = 0xffff;
}
SDValue BasePtr = Store->getBasePtr();
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
DAG.getConstant(2, MVT::i32));
SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
DAG.getConstant(0x3, MVT::i32));
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, MVT::i32));
SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
Store->getValue());
SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
MaskedValue, ShiftAmt);
SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
ShiftAmt);
DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
DAG.getConstant(0xffffffff, MVT::i32));
Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
}
return SDValue();
}
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Num = Op.getOperand(0);
SDValue Den = Op.getOperand(1);
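// Expand unsigned division using the hardware reciprocal approximation:
// compute RCP = URECIP(Den), estimate and correct its rounding error, form a
// trial quotient and remainder, then adjust each by at most one step in
// either direction.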
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
// RCP_LO = umulo(RCP, Den)
SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
// RCP_HI = mulhu(RCP, Den)
SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
// NEG_RCP_LO = -RCP_LO
SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
RCP_LO);
// ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
NEG_RCP_LO, RCP_LO,
ISD::SETEQ);
// Calculate the rounding error from the URECIP instruction
// E = mulhu(ABS_RCP_LO, RCP)
SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
// RCP_A_E = RCP + E
SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
// RCP_S_E = RCP - E
SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
// Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
RCP_A_E, RCP_S_E,
ISD::SETEQ);
// Quotient = mulhu(Tmp0, Num)
SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
// Num_S_Remainder = Quotient * Den
SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
// Remainder = Num - Num_S_Remainder
SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
// Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
DAG.getConstant(-1, VT),
DAG.getConstant(0, VT),
ISD::SETUGE);
// Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
Num_S_Remainder,
DAG.getConstant(-1, VT),
DAG.getConstant(0, VT),
ISD::SETUGE);
// Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
Remainder_GE_Zero);
// Calculate Division result:
// Quotient_A_One = Quotient + 1
SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
DAG.getConstant(1, VT));
// Quotient_S_One = Quotient - 1
SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
DAG.getConstant(1, VT));
// Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
Quotient, Quotient_A_One, ISD::SETEQ);
// Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
Quotient_S_One, Div, ISD::SETEQ);
// Calculate Rem result:
// Remainder_S_Den = Remainder - Den
SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
// Remainder_A_Den = Remainder + Den
SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
// Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
Remainder, Remainder_S_Den, ISD::SETEQ);
// Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
Remainder_A_Den, Rem, ISD::SETEQ);
SDValue Ops[2] = {
Div,
Rem
};
return DAG.getMergeValues(Ops, DL);
}
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue S0 = Op.getOperand(0);
SDLoc DL(Op);
if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
return SDValue();
// f32 uint_to_fp i64
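// Split the i64 into 32-bit halves, convert each half to f32, and combine
// them as FloatLo + FloatHi * 2^32.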
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(0, MVT::i32));
SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(1, MVT::i32));
SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
}
SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
unsigned BitsDiff,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
SDValue Shift = DAG.getConstant(BitsDiff, VT);
// Shift left by 'Shift' bits.
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
// Signed shift Right by 'Shift' bits.
return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
}
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
if (!VT.isVector())
return SDValue();
SDValue Src = Op.getOperand(0);
SDLoc DL(Op);
// TODO: Don't scalarize on Evergreen?
unsigned NElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> Args;
DAG.ExtractVectorElements(Src, Args, 0, NElts);
SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
for (unsigned I = 0; I < NElts; ++I)
Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
}
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
static bool isU24(SDValue Op, SelectionDAG &DAG) {
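// The value fits in 24 unsigned bits if everything above the low 24 bits is
// known to be zero.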
APInt KnownZero, KnownOne;
EVT VT = Op.getValueType();
DAG.computeKnownBits(Op, KnownZero, KnownOne);
return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// In order for this to be a signed 24-bit value, bit 23 must
// be a sign bit.
return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
// as unsigned 24-bit values.
(VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}
static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
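// Only the low 24 bits of a 24-bit multiply operand are used, so try to
// simplify the operand computation with SimplifyDemandedBits.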
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Op.getValueType();
APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
switch(N->getOpcode()) {
default: break;
case ISD::MUL: {
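// If both operands are known to fit in 24 bits, use the 24-bit multiply
// nodes, which select to the native mul24 instructions.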
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue Mul;
// FIXME: Add support for 24-bit multiply with 64-bit output on SI.
if (VT.isVector() || VT.getSizeInBits() > 32)
break;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
} else {
break;
}
// We need to use sext even for MUL_U24, because MUL_U24 is used
// for signed multiply of 8 and 16-bit types.
SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
return Reg;
}
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24: {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
simplifyI24(N0, DCI);
simplifyI24(N1, DCI);
return SDValue();
}
case ISD::SELECT_CC: {
return CombineMinMax(N, DAG);
}
}
return SDValue();
}
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
void AMDGPUTargetLowering::getOriginalFunctionArgs(
SelectionDAG &DAG,
const Function *F,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<ISD::InputArg> &OrigIns) const {
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
if (Ins[i].ArgVT == Ins[i].VT) {
OrigIns.push_back(Ins[i]);
continue;
}
EVT VT;
if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
// Vector has been split into scalars.
VT = Ins[i].ArgVT.getVectorElementType();
} else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
Ins[i].ArgVT.getVectorElementType() !=
Ins[i].VT.getVectorElementType()) {
// Vector elements have been promoted
VT = Ins[i].ArgVT;
} else {
// Vector has been split into smaller vectors.
VT = Ins[i].VT;
}
ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
Ins[i].OrigArgIndex, Ins[i].PartOffset);
OrigIns.push_back(Arg);
}
}
bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->isExactlyValue(1.0);
}
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
return C->isAllOnesValue();
}
return false;
}
bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->getValueAPF().isZero();
}
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
return C->isNullValue();
}
return false;
}
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned VirtualRegister;
if (!MRI.isLiveIn(Reg)) {
VirtualRegister = MRI.createVirtualRegister(RC);
MRI.addLiveIn(Reg, VirtualRegister);
} else {
VirtualRegister = MRI.getLiveInVirtReg(Reg);
}
return DAG.getRegister(VirtualRegister, VT);
}
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default: return nullptr;
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
NODE_NAME_CASE(DIV_INF);
NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(FMAX)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(SMIN)
NODE_NAME_CASE(UMIN)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
NODE_NAME_CASE(SAMPLED)
NODE_NAME_CASE(SAMPLEL)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
}
static void computeKnownBitsForMinMax(const SDValue Op0,
const SDValue Op1,
APInt &KnownZero,
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth) {
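// A result bit of a min/max is known only if it is known, with the same
// value, in both operands.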
APInt Op0Zero, Op0One;
APInt Op1Zero, Op1One;
DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
KnownZero = Op0Zero & Op1Zero;
KnownOne = Op0One & Op1One;
}
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth) const {
KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::INTRINSIC_WO_CHAIN: {
// FIXME: The intrinsic should just use the node.
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
case AMDGPUIntrinsic::AMDGPU_imax:
case AMDGPUIntrinsic::AMDGPU_umax:
case AMDGPUIntrinsic::AMDGPU_imin:
case AMDGPUIntrinsic::AMDGPU_umin:
computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
KnownZero, KnownOne, DAG, Depth);
break;
default:
break;
}
break;
}
case AMDGPUISD::SMAX:
case AMDGPUISD::UMAX:
case AMDGPUISD::SMIN:
case AMDGPUISD::UMIN:
computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
KnownZero, KnownOne, DAG, Depth);
break;
default:
break;
}
}