mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-14 11:32:34 +00:00
70968365db
vector type legalization strategies in a more fine grained manner, and change the legalization of several v1iN types and v1f32 to be widening rather than scalarization on AArch64. This fixes an assertion failure caused by scalarizing nodes like "v1i32 trunc v1i64". As v1i64 is legal it will fail to scalarize v1i32. This also provides a foundation for other targets to have more granular control over how vector types are legalized. Patch by Hao Liu, reviewed by Tim Northover. I'm committing it to allow some work to start taking place on top of this patch as it adds some really important hooks to the backend that I'd like to immediately start using. =] http://reviews.llvm.org/D4322 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212242 91177308-0d34-0410-b5e6-96231b3b80d8
3319 lines
120 KiB
C++
3319 lines
120 KiB
C++
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
|
|
// selection DAG.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "NVPTXISelLowering.h"
|
|
#include "NVPTX.h"
|
|
#include "NVPTXTargetMachine.h"
|
|
#include "NVPTXTargetObjectFile.h"
|
|
#include "NVPTXUtilities.h"
|
|
#include "llvm/CodeGen/Analysis.h"
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
|
#include "llvm/IR/CallSite.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/GlobalValue.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/MC/MCSectionELF.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include <sstream>
|
|
|
|
#undef DEBUG_TYPE
|
|
#define DEBUG_TYPE "nvptx-lower"
|
|
|
|
using namespace llvm;
|
|
|
|
// Running identifier for call sites. Its current value is embedded in the
// "prototype_<N>" label built by getPrototype() and is passed as the
// CALLSEQ_START operand in LowerCall().
// NOTE(review): the increment is not visible in this part of the file;
// presumably it is bumped once per lowered call -- confirm.
static unsigned int uniqueCallSite = 0;
|
|
|
|
// Command-line switch "-nvptx-sched4reg" (off by default). When set, the
// NVPTXTargetLowering constructor selects Sched::RegPressure instead of
// Sched::Source as the scheduling preference.
static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));
|
|
|
|
/// Returns true if \p VT is one of the simple vector types listed in the
/// table below (two- and four-element vectors of i1/i8/i16/i32/f32, plus
/// v2i64 and v2f64), and false for every other type.
///
/// The NVPTXTargetLowering constructor uses this predicate to choose the
/// vector types whose LOAD, STORE and INTRINSIC_W_CHAIN nodes are lowered by
/// custom code.
static bool IsPTXVectorType(MVT VT) {
  static const MVT::SimpleValueType PTXVectorTypes[] = {
      MVT::v2i1,  MVT::v4i1,  MVT::v2i8,  MVT::v4i8,
      MVT::v2i16, MVT::v4i16, MVT::v2i32, MVT::v4i32,
      MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};

  for (MVT::SimpleValueType Candidate : PTXVectorTypes)
    if (VT.SimpleTy == Candidate)
      return true;
  return false;
}
|
|
|
|
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
|
|
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
|
|
/// into their primitive components.
|
|
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
|
|
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
|
|
/// LowerCall, and LowerReturn.
|
|
static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
|
|
SmallVectorImpl<EVT> &ValueVTs,
|
|
SmallVectorImpl<uint64_t> *Offsets = nullptr,
|
|
uint64_t StartingOffset = 0) {
|
|
SmallVector<EVT, 16> TempVTs;
|
|
SmallVector<uint64_t, 16> TempOffsets;
|
|
|
|
ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
|
|
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
|
|
EVT VT = TempVTs[i];
|
|
uint64_t Off = TempOffsets[i];
|
|
if (VT.isVector())
|
|
for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
|
|
ValueVTs.push_back(VT.getVectorElementType());
|
|
if (Offsets)
|
|
Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
|
|
}
|
|
else {
|
|
ValueVTs.push_back(VT);
|
|
if (Offsets)
|
|
Offsets->push_back(Off);
|
|
}
|
|
}
|
|
}
|
|
|
|
// NVPTXTargetLowering Constructor.
|
|
NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
|
|
: TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
|
|
nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
|
|
|
|
// always lower memset, memcpy, and memmove intrinsics to load/store
|
|
// instructions, rather
|
|
// then generating calls to memset, mempcy or memmove.
|
|
MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
|
|
MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
|
|
MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
|
|
|
|
setBooleanContents(ZeroOrNegativeOneBooleanContent);
|
|
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
|
|
|
|
// Jump is Expensive. Don't create extra control flow for 'and', 'or'
|
|
// condition branches.
|
|
setJumpIsExpensive(true);
|
|
|
|
// By default, use the Source scheduling
|
|
if (sched4reg)
|
|
setSchedulingPreference(Sched::RegPressure);
|
|
else
|
|
setSchedulingPreference(Sched::Source);
|
|
|
|
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
|
|
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
|
|
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
|
|
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
|
|
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
|
|
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
|
|
|
|
// Operations not directly supported by NVPTX.
|
|
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
|
|
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::i8, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::i16, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
|
|
setOperationAction(ISD::BR_CC, MVT::i64, Expand);
|
|
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
|
|
// For others we will expand to a SHL/SRA pair.
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
|
|
|
|
setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
|
|
setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
|
|
setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
|
|
setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
|
|
setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
|
|
setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
|
|
|
|
if (nvptxSubtarget.hasROT64()) {
|
|
setOperationAction(ISD::ROTL, MVT::i64, Legal);
|
|
setOperationAction(ISD::ROTR, MVT::i64, Legal);
|
|
} else {
|
|
setOperationAction(ISD::ROTL, MVT::i64, Expand);
|
|
setOperationAction(ISD::ROTR, MVT::i64, Expand);
|
|
}
|
|
if (nvptxSubtarget.hasROT32()) {
|
|
setOperationAction(ISD::ROTL, MVT::i32, Legal);
|
|
setOperationAction(ISD::ROTR, MVT::i32, Legal);
|
|
} else {
|
|
setOperationAction(ISD::ROTL, MVT::i32, Expand);
|
|
setOperationAction(ISD::ROTR, MVT::i32, Expand);
|
|
}
|
|
|
|
setOperationAction(ISD::ROTL, MVT::i16, Expand);
|
|
setOperationAction(ISD::ROTR, MVT::i16, Expand);
|
|
setOperationAction(ISD::ROTL, MVT::i8, Expand);
|
|
setOperationAction(ISD::ROTR, MVT::i8, Expand);
|
|
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
|
|
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
|
|
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
|
|
|
|
// Indirect branch is not supported.
|
|
// This also disables Jump Table creation.
|
|
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
|
|
setOperationAction(ISD::BRIND, MVT::Other, Expand);
|
|
|
|
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
|
|
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
|
|
|
|
// We want to legalize constant related memmove and memcopy
|
|
// intrinsics.
|
|
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
|
|
|
|
// Turn FP extload into load/fextend
|
|
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
|
|
// Turn FP truncstore into trunc + store.
|
|
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
|
|
|
|
// PTX does not support load / store predicate registers
|
|
setOperationAction(ISD::LOAD, MVT::i1, Custom);
|
|
setOperationAction(ISD::STORE, MVT::i1, Custom);
|
|
|
|
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
|
|
setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
|
|
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
|
|
setTruncStoreAction(MVT::i32, MVT::i1, Expand);
|
|
setTruncStoreAction(MVT::i16, MVT::i1, Expand);
|
|
setTruncStoreAction(MVT::i8, MVT::i1, Expand);
|
|
|
|
// This is legal in NVPTX
|
|
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
|
|
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
|
|
|
|
// TRAP can be lowered to PTX trap
|
|
setOperationAction(ISD::TRAP, MVT::Other, Legal);
|
|
|
|
setOperationAction(ISD::ADDC, MVT::i64, Expand);
|
|
setOperationAction(ISD::ADDE, MVT::i64, Expand);
|
|
|
|
// Register custom handling for vector loads/stores
|
|
for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
|
|
++i) {
|
|
MVT VT = (MVT::SimpleValueType) i;
|
|
if (IsPTXVectorType(VT)) {
|
|
setOperationAction(ISD::LOAD, VT, Custom);
|
|
setOperationAction(ISD::STORE, VT, Custom);
|
|
setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
|
|
}
|
|
}
|
|
|
|
// Custom handling for i8 intrinsics
|
|
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
|
|
|
|
setOperationAction(ISD::CTLZ, MVT::i16, Legal);
|
|
setOperationAction(ISD::CTLZ, MVT::i32, Legal);
|
|
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
|
|
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
|
|
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
|
|
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
|
|
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
|
|
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
|
|
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
|
|
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
|
|
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
|
|
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
|
|
setOperationAction(ISD::CTPOP, MVT::i16, Legal);
|
|
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
|
|
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
|
|
|
|
// We have some custom DAG combine patterns for these nodes
|
|
setTargetDAGCombine(ISD::ADD);
|
|
setTargetDAGCombine(ISD::AND);
|
|
setTargetDAGCombine(ISD::FADD);
|
|
setTargetDAGCombine(ISD::MUL);
|
|
setTargetDAGCombine(ISD::SHL);
|
|
|
|
// Now deduce the information based on the above mentioned
|
|
// actions
|
|
computeRegisterProperties();
|
|
}
|
|
|
|
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|
switch (Opcode) {
|
|
default:
|
|
return nullptr;
|
|
case NVPTXISD::CALL:
|
|
return "NVPTXISD::CALL";
|
|
case NVPTXISD::RET_FLAG:
|
|
return "NVPTXISD::RET_FLAG";
|
|
case NVPTXISD::Wrapper:
|
|
return "NVPTXISD::Wrapper";
|
|
case NVPTXISD::DeclareParam:
|
|
return "NVPTXISD::DeclareParam";
|
|
case NVPTXISD::DeclareScalarParam:
|
|
return "NVPTXISD::DeclareScalarParam";
|
|
case NVPTXISD::DeclareRet:
|
|
return "NVPTXISD::DeclareRet";
|
|
case NVPTXISD::DeclareRetParam:
|
|
return "NVPTXISD::DeclareRetParam";
|
|
case NVPTXISD::PrintCall:
|
|
return "NVPTXISD::PrintCall";
|
|
case NVPTXISD::LoadParam:
|
|
return "NVPTXISD::LoadParam";
|
|
case NVPTXISD::LoadParamV2:
|
|
return "NVPTXISD::LoadParamV2";
|
|
case NVPTXISD::LoadParamV4:
|
|
return "NVPTXISD::LoadParamV4";
|
|
case NVPTXISD::StoreParam:
|
|
return "NVPTXISD::StoreParam";
|
|
case NVPTXISD::StoreParamV2:
|
|
return "NVPTXISD::StoreParamV2";
|
|
case NVPTXISD::StoreParamV4:
|
|
return "NVPTXISD::StoreParamV4";
|
|
case NVPTXISD::StoreParamS32:
|
|
return "NVPTXISD::StoreParamS32";
|
|
case NVPTXISD::StoreParamU32:
|
|
return "NVPTXISD::StoreParamU32";
|
|
case NVPTXISD::CallArgBegin:
|
|
return "NVPTXISD::CallArgBegin";
|
|
case NVPTXISD::CallArg:
|
|
return "NVPTXISD::CallArg";
|
|
case NVPTXISD::LastCallArg:
|
|
return "NVPTXISD::LastCallArg";
|
|
case NVPTXISD::CallArgEnd:
|
|
return "NVPTXISD::CallArgEnd";
|
|
case NVPTXISD::CallVoid:
|
|
return "NVPTXISD::CallVoid";
|
|
case NVPTXISD::CallVal:
|
|
return "NVPTXISD::CallVal";
|
|
case NVPTXISD::CallSymbol:
|
|
return "NVPTXISD::CallSymbol";
|
|
case NVPTXISD::Prototype:
|
|
return "NVPTXISD::Prototype";
|
|
case NVPTXISD::MoveParam:
|
|
return "NVPTXISD::MoveParam";
|
|
case NVPTXISD::StoreRetval:
|
|
return "NVPTXISD::StoreRetval";
|
|
case NVPTXISD::StoreRetvalV2:
|
|
return "NVPTXISD::StoreRetvalV2";
|
|
case NVPTXISD::StoreRetvalV4:
|
|
return "NVPTXISD::StoreRetvalV4";
|
|
case NVPTXISD::PseudoUseParam:
|
|
return "NVPTXISD::PseudoUseParam";
|
|
case NVPTXISD::RETURN:
|
|
return "NVPTXISD::RETURN";
|
|
case NVPTXISD::CallSeqBegin:
|
|
return "NVPTXISD::CallSeqBegin";
|
|
case NVPTXISD::CallSeqEnd:
|
|
return "NVPTXISD::CallSeqEnd";
|
|
case NVPTXISD::CallPrototype:
|
|
return "NVPTXISD::CallPrototype";
|
|
case NVPTXISD::LoadV2:
|
|
return "NVPTXISD::LoadV2";
|
|
case NVPTXISD::LoadV4:
|
|
return "NVPTXISD::LoadV4";
|
|
case NVPTXISD::LDGV2:
|
|
return "NVPTXISD::LDGV2";
|
|
case NVPTXISD::LDGV4:
|
|
return "NVPTXISD::LDGV4";
|
|
case NVPTXISD::LDUV2:
|
|
return "NVPTXISD::LDUV2";
|
|
case NVPTXISD::LDUV4:
|
|
return "NVPTXISD::LDUV4";
|
|
case NVPTXISD::StoreV2:
|
|
return "NVPTXISD::StoreV2";
|
|
case NVPTXISD::StoreV4:
|
|
return "NVPTXISD::StoreV4";
|
|
case NVPTXISD::FUN_SHFL_CLAMP:
|
|
return "NVPTXISD::FUN_SHFL_CLAMP";
|
|
case NVPTXISD::FUN_SHFR_CLAMP:
|
|
return "NVPTXISD::FUN_SHFR_CLAMP";
|
|
case NVPTXISD::IMAD:
|
|
return "NVPTXISD::IMAD";
|
|
case NVPTXISD::MUL_WIDE_SIGNED:
|
|
return "NVPTXISD::MUL_WIDE_SIGNED";
|
|
case NVPTXISD::MUL_WIDE_UNSIGNED:
|
|
return "NVPTXISD::MUL_WIDE_UNSIGNED";
|
|
case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32";
|
|
case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
|
|
case NVPTXISD::Tex1DFloatFloatLevel:
|
|
return "NVPTXISD::Tex1DFloatFloatLevel";
|
|
case NVPTXISD::Tex1DFloatFloatGrad:
|
|
return "NVPTXISD::Tex1DFloatFloatGrad";
|
|
case NVPTXISD::Tex1DI32I32: return "NVPTXISD::Tex1DI32I32";
|
|
case NVPTXISD::Tex1DI32Float: return "NVPTXISD::Tex1DI32Float";
|
|
case NVPTXISD::Tex1DI32FloatLevel:
|
|
return "NVPTXISD::Tex1DI32FloatLevel";
|
|
case NVPTXISD::Tex1DI32FloatGrad:
|
|
return "NVPTXISD::Tex1DI32FloatGrad";
|
|
case NVPTXISD::Tex1DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32";
|
|
case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
|
|
case NVPTXISD::Tex1DArrayFloatFloatLevel:
|
|
return "NVPTXISD::Tex2DArrayFloatFloatLevel";
|
|
case NVPTXISD::Tex1DArrayFloatFloatGrad:
|
|
return "NVPTXISD::Tex2DArrayFloatFloatGrad";
|
|
case NVPTXISD::Tex1DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32";
|
|
case NVPTXISD::Tex1DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float";
|
|
case NVPTXISD::Tex1DArrayI32FloatLevel:
|
|
return "NVPTXISD::Tex2DArrayI32FloatLevel";
|
|
case NVPTXISD::Tex1DArrayI32FloatGrad:
|
|
return "NVPTXISD::Tex2DArrayI32FloatGrad";
|
|
case NVPTXISD::Tex2DFloatI32: return "NVPTXISD::Tex2DFloatI32";
|
|
case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
|
|
case NVPTXISD::Tex2DFloatFloatLevel:
|
|
return "NVPTXISD::Tex2DFloatFloatLevel";
|
|
case NVPTXISD::Tex2DFloatFloatGrad:
|
|
return "NVPTXISD::Tex2DFloatFloatGrad";
|
|
case NVPTXISD::Tex2DI32I32: return "NVPTXISD::Tex2DI32I32";
|
|
case NVPTXISD::Tex2DI32Float: return "NVPTXISD::Tex2DI32Float";
|
|
case NVPTXISD::Tex2DI32FloatLevel:
|
|
return "NVPTXISD::Tex2DI32FloatLevel";
|
|
case NVPTXISD::Tex2DI32FloatGrad:
|
|
return "NVPTXISD::Tex2DI32FloatGrad";
|
|
case NVPTXISD::Tex2DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32";
|
|
case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
|
|
case NVPTXISD::Tex2DArrayFloatFloatLevel:
|
|
return "NVPTXISD::Tex2DArrayFloatFloatLevel";
|
|
case NVPTXISD::Tex2DArrayFloatFloatGrad:
|
|
return "NVPTXISD::Tex2DArrayFloatFloatGrad";
|
|
case NVPTXISD::Tex2DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32";
|
|
case NVPTXISD::Tex2DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float";
|
|
case NVPTXISD::Tex2DArrayI32FloatLevel:
|
|
return "NVPTXISD::Tex2DArrayI32FloatLevel";
|
|
case NVPTXISD::Tex2DArrayI32FloatGrad:
|
|
return "NVPTXISD::Tex2DArrayI32FloatGrad";
|
|
case NVPTXISD::Tex3DFloatI32: return "NVPTXISD::Tex3DFloatI32";
|
|
case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
|
|
case NVPTXISD::Tex3DFloatFloatLevel:
|
|
return "NVPTXISD::Tex3DFloatFloatLevel";
|
|
case NVPTXISD::Tex3DFloatFloatGrad:
|
|
return "NVPTXISD::Tex3DFloatFloatGrad";
|
|
case NVPTXISD::Tex3DI32I32: return "NVPTXISD::Tex3DI32I32";
|
|
case NVPTXISD::Tex3DI32Float: return "NVPTXISD::Tex3DI32Float";
|
|
case NVPTXISD::Tex3DI32FloatLevel:
|
|
return "NVPTXISD::Tex3DI32FloatLevel";
|
|
case NVPTXISD::Tex3DI32FloatGrad:
|
|
return "NVPTXISD::Tex3DI32FloatGrad";
|
|
|
|
case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
|
|
case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
|
|
case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
|
|
case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
|
|
case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
|
|
case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
|
|
case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
|
|
case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
|
|
case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
|
|
|
|
case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
|
|
case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
|
|
case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
|
|
case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
|
|
case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
|
|
case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
|
|
case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
|
|
case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
|
|
case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
|
|
|
|
case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
|
|
case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
|
|
case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
|
|
case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
|
|
case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
|
|
case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
|
|
case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
|
|
case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
|
|
case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
|
|
|
|
case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
|
|
case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
|
|
case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
|
|
case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
|
|
case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
|
|
case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
|
|
case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
|
|
case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
|
|
case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
|
|
|
|
case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
|
|
case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
|
|
case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
|
|
case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
|
|
case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
|
|
case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
|
|
case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
|
|
case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
|
|
case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
|
|
}
|
|
}
|
|
|
|
/// Chooses how the type legalizer should treat the vector type \p VT.
/// Vectors of i1 with an element count other than one are split; every other
/// type is handled by the TargetLoweringBase default.
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
  const bool IsSplittablePredicateVector =
      VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1;

  return IsSplittablePredicateVector
             ? TypeSplitVector
             : TargetLoweringBase::getPreferredVectorAction(VT);
}
|
|
|
|
/// Lowers a GlobalAddress node: the referenced global is re-emitted as a
/// target global address of pointer type and wrapped in an NVPTXISD::Wrapper
/// node, which is returned.
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const MVT PtrVT = getPointerTy();

  const GlobalAddressSDNode *AddrNode = cast<GlobalAddressSDNode>(Op);
  SDValue TargetAddr =
      DAG.getTargetGlobalAddress(AddrNode->getGlobal(), Loc, PtrVT);

  return DAG.getNode(NVPTXISD::Wrapper, Loc, PtrVT, TargetAddr);
}
|
|
|
|
std::string
|
|
NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
|
|
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
|
unsigned retAlignment,
|
|
const ImmutableCallSite *CS) const {
|
|
|
|
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
|
|
assert(isABI && "Non-ABI compilation is not supported");
|
|
if (!isABI)
|
|
return "";
|
|
|
|
std::stringstream O;
|
|
O << "prototype_" << uniqueCallSite << " : .callprototype ";
|
|
|
|
if (retTy->getTypeID() == Type::VoidTyID) {
|
|
O << "()";
|
|
} else {
|
|
O << "(";
|
|
if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
|
|
unsigned size = 0;
|
|
if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
|
|
size = ITy->getBitWidth();
|
|
if (size < 32)
|
|
size = 32;
|
|
} else {
|
|
assert(retTy->isFloatingPointTy() &&
|
|
"Floating point type expected here");
|
|
size = retTy->getPrimitiveSizeInBits();
|
|
}
|
|
|
|
O << ".param .b" << size << " _";
|
|
} else if (isa<PointerType>(retTy)) {
|
|
O << ".param .b" << getPointerTy().getSizeInBits() << " _";
|
|
} else {
|
|
if((retTy->getTypeID() == Type::StructTyID) ||
|
|
isa<VectorType>(retTy)) {
|
|
O << ".param .align "
|
|
<< retAlignment
|
|
<< " .b8 _["
|
|
<< getDataLayout()->getTypeAllocSize(retTy) << "]";
|
|
} else {
|
|
assert(false && "Unknown return type");
|
|
}
|
|
}
|
|
O << ") ";
|
|
}
|
|
O << "_ (";
|
|
|
|
bool first = true;
|
|
MVT thePointerTy = getPointerTy();
|
|
|
|
unsigned OIdx = 0;
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
|
|
Type *Ty = Args[i].Ty;
|
|
if (!first) {
|
|
O << ", ";
|
|
}
|
|
first = false;
|
|
|
|
if (Outs[OIdx].Flags.isByVal() == false) {
|
|
if (Ty->isAggregateType() || Ty->isVectorTy()) {
|
|
unsigned align = 0;
|
|
const CallInst *CallI = cast<CallInst>(CS->getInstruction());
|
|
const DataLayout *TD = getDataLayout();
|
|
// +1 because index 0 is reserved for return type alignment
|
|
if (!llvm::getAlign(*CallI, i + 1, align))
|
|
align = TD->getABITypeAlignment(Ty);
|
|
unsigned sz = TD->getTypeAllocSize(Ty);
|
|
O << ".param .align " << align << " .b8 ";
|
|
O << "_";
|
|
O << "[" << sz << "]";
|
|
// update the index for Outs
|
|
SmallVector<EVT, 16> vtparts;
|
|
ComputeValueVTs(*this, Ty, vtparts);
|
|
if (unsigned len = vtparts.size())
|
|
OIdx += len - 1;
|
|
continue;
|
|
}
|
|
// i8 types in IR will be i16 types in SDAG
|
|
assert((getValueType(Ty) == Outs[OIdx].VT ||
|
|
(getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
|
|
"type mismatch between callee prototype and arguments");
|
|
// scalar type
|
|
unsigned sz = 0;
|
|
if (isa<IntegerType>(Ty)) {
|
|
sz = cast<IntegerType>(Ty)->getBitWidth();
|
|
if (sz < 32)
|
|
sz = 32;
|
|
} else if (isa<PointerType>(Ty))
|
|
sz = thePointerTy.getSizeInBits();
|
|
else
|
|
sz = Ty->getPrimitiveSizeInBits();
|
|
O << ".param .b" << sz << " ";
|
|
O << "_";
|
|
continue;
|
|
}
|
|
const PointerType *PTy = dyn_cast<PointerType>(Ty);
|
|
assert(PTy && "Param with byval attribute should be a pointer type");
|
|
Type *ETy = PTy->getElementType();
|
|
|
|
unsigned align = Outs[OIdx].Flags.getByValAlign();
|
|
unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
|
|
O << ".param .align " << align << " .b8 ";
|
|
O << "_";
|
|
O << "[" << sz << "]";
|
|
}
|
|
O << ");";
|
|
return O.str();
|
|
}
|
|
|
|
unsigned
|
|
NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
|
|
const ImmutableCallSite *CS,
|
|
Type *Ty,
|
|
unsigned Idx) const {
|
|
const DataLayout *TD = getDataLayout();
|
|
unsigned Align = 0;
|
|
const Value *DirectCallee = CS->getCalledFunction();
|
|
|
|
if (!DirectCallee) {
|
|
// We don't have a direct function symbol, but that may be because of
|
|
// constant cast instructions in the call.
|
|
const Instruction *CalleeI = CS->getInstruction();
|
|
assert(CalleeI && "Call target is not a function or derived value?");
|
|
|
|
// With bitcast'd call targets, the instruction will be the call
|
|
if (isa<CallInst>(CalleeI)) {
|
|
// Check if we have call alignment metadata
|
|
if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
|
|
return Align;
|
|
|
|
const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
|
|
// Ignore any bitcast instructions
|
|
while(isa<ConstantExpr>(CalleeV)) {
|
|
const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
|
|
if (!CE->isCast())
|
|
break;
|
|
// Look through the bitcast
|
|
CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
|
|
}
|
|
|
|
// We have now looked past all of the bitcasts. Do we finally have a
|
|
// Function?
|
|
if (isa<Function>(CalleeV))
|
|
DirectCallee = CalleeV;
|
|
}
|
|
}
|
|
|
|
// Check for function alignment information if we found that the
|
|
// ultimate target is a Function
|
|
if (DirectCallee)
|
|
if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
|
|
return Align;
|
|
|
|
// Call is indirect or alignment information is not available, fall back to
|
|
// the ABI type alignment
|
|
return TD->getABITypeAlignment(Ty);
|
|
}
|
|
|
|
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
|
SmallVectorImpl<SDValue> &InVals) const {
|
|
SelectionDAG &DAG = CLI.DAG;
|
|
SDLoc dl = CLI.DL;
|
|
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
|
|
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
|
|
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
|
|
SDValue Chain = CLI.Chain;
|
|
SDValue Callee = CLI.Callee;
|
|
bool &isTailCall = CLI.IsTailCall;
|
|
ArgListTy &Args = CLI.getArgs();
|
|
Type *retTy = CLI.RetTy;
|
|
ImmutableCallSite *CS = CLI.CS;
|
|
|
|
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
|
|
assert(isABI && "Non-ABI compilation is not supported");
|
|
if (!isABI)
|
|
return Chain;
|
|
const DataLayout *TD = getDataLayout();
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
const Function *F = MF.getFunction();
|
|
|
|
SDValue tempChain = Chain;
|
|
Chain =
|
|
DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
|
|
dl);
|
|
SDValue InFlag = Chain.getValue(1);
|
|
|
|
unsigned paramCount = 0;
|
|
// Args.size() and Outs.size() need not match.
|
|
// Outs.size() will be larger
|
|
// * if there is an aggregate argument with multiple fields (each field
|
|
// showing up separately in Outs)
|
|
// * if there is a vector argument with more than typical vector-length
|
|
// elements (generally if more than 4) where each vector element is
|
|
// individually present in Outs.
|
|
// So a different index should be used for indexing into Outs/OutVals.
|
|
// See similar issue in LowerFormalArguments.
|
|
unsigned OIdx = 0;
|
|
// Declare the .params or .reg need to pass values
|
|
// to the function
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
|
|
EVT VT = Outs[OIdx].VT;
|
|
Type *Ty = Args[i].Ty;
|
|
|
|
if (Outs[OIdx].Flags.isByVal() == false) {
|
|
if (Ty->isAggregateType()) {
|
|
// aggregate
|
|
SmallVector<EVT, 16> vtparts;
|
|
SmallVector<uint64_t, 16> Offsets;
|
|
ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
|
|
|
|
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
|
|
// declare .param .align <align> .b8 .param<n>[<size>];
|
|
unsigned sz = TD->getTypeAllocSize(Ty);
|
|
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(sz, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
|
|
DeclareParamOps);
|
|
InFlag = Chain.getValue(1);
|
|
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
|
|
EVT elemtype = vtparts[j];
|
|
unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
|
|
if (elemtype.isInteger() && (sz < 8))
|
|
sz = 8;
|
|
SDValue StVal = OutVals[OIdx];
|
|
if (elemtype.getSizeInBits() < 16) {
|
|
StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
|
|
}
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CopyParamOps[] = { Chain,
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(Offsets[j], MVT::i32),
|
|
StVal, InFlag };
|
|
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
|
|
CopyParamVTs, CopyParamOps,
|
|
elemtype, MachinePointerInfo(),
|
|
ArgAlign);
|
|
InFlag = Chain.getValue(1);
|
|
++OIdx;
|
|
}
|
|
if (vtparts.size() > 0)
|
|
--OIdx;
|
|
++paramCount;
|
|
continue;
|
|
}
|
|
if (Ty->isVectorTy()) {
|
|
EVT ObjectVT = getValueType(Ty);
|
|
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
|
|
// declare .param .align <align> .b8 .param<n>[<size>];
|
|
unsigned sz = TD->getTypeAllocSize(Ty);
|
|
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(sz, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
|
|
DeclareParamOps);
|
|
InFlag = Chain.getValue(1);
|
|
unsigned NumElts = ObjectVT.getVectorNumElements();
|
|
EVT EltVT = ObjectVT.getVectorElementType();
|
|
EVT MemVT = EltVT;
|
|
bool NeedExtend = false;
|
|
if (EltVT.getSizeInBits() < 16) {
|
|
NeedExtend = true;
|
|
EltVT = MVT::i16;
|
|
}
|
|
|
|
// V1 store
|
|
if (NumElts == 1) {
|
|
SDValue Elt = OutVals[OIdx++];
|
|
if (NeedExtend)
|
|
Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
|
|
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CopyParamOps[] = { Chain,
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), Elt,
|
|
InFlag };
|
|
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
|
|
CopyParamVTs, CopyParamOps,
|
|
MemVT, MachinePointerInfo());
|
|
InFlag = Chain.getValue(1);
|
|
} else if (NumElts == 2) {
|
|
SDValue Elt0 = OutVals[OIdx++];
|
|
SDValue Elt1 = OutVals[OIdx++];
|
|
if (NeedExtend) {
|
|
Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
|
|
Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
|
|
}
|
|
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CopyParamOps[] = { Chain,
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), Elt0, Elt1,
|
|
InFlag };
|
|
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
|
|
CopyParamVTs, CopyParamOps,
|
|
MemVT, MachinePointerInfo());
|
|
InFlag = Chain.getValue(1);
|
|
} else {
|
|
unsigned curOffset = 0;
|
|
// V4 stores
|
|
// We have at least 4 elements (<3 x Ty> expands to 4 elements) and
|
|
// the
|
|
// vector will be expanded to a power of 2 elements, so we know we can
|
|
// always round up to the next multiple of 4 when creating the vector
|
|
// stores.
|
|
// e.g. 4 elem => 1 st.v4
|
|
// 6 elem => 2 st.v4
|
|
// 8 elem => 2 st.v4
|
|
// 11 elem => 3 st.v4
|
|
unsigned VecSize = 4;
|
|
if (EltVT.getSizeInBits() == 64)
|
|
VecSize = 2;
|
|
|
|
// This is potentially only part of a vector, so assume all elements
|
|
// are packed together.
|
|
unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
|
|
|
|
for (unsigned i = 0; i < NumElts; i += VecSize) {
|
|
// Get values
|
|
SDValue StoreVal;
|
|
SmallVector<SDValue, 8> Ops;
|
|
Ops.push_back(Chain);
|
|
Ops.push_back(DAG.getConstant(paramCount, MVT::i32));
|
|
Ops.push_back(DAG.getConstant(curOffset, MVT::i32));
|
|
|
|
unsigned Opc = NVPTXISD::StoreParamV2;
|
|
|
|
StoreVal = OutVals[OIdx++];
|
|
if (NeedExtend)
|
|
StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
|
|
Ops.push_back(StoreVal);
|
|
|
|
if (i + 1 < NumElts) {
|
|
StoreVal = OutVals[OIdx++];
|
|
if (NeedExtend)
|
|
StoreVal =
|
|
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
|
|
} else {
|
|
StoreVal = DAG.getUNDEF(EltVT);
|
|
}
|
|
Ops.push_back(StoreVal);
|
|
|
|
if (VecSize == 4) {
|
|
Opc = NVPTXISD::StoreParamV4;
|
|
if (i + 2 < NumElts) {
|
|
StoreVal = OutVals[OIdx++];
|
|
if (NeedExtend)
|
|
StoreVal =
|
|
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
|
|
} else {
|
|
StoreVal = DAG.getUNDEF(EltVT);
|
|
}
|
|
Ops.push_back(StoreVal);
|
|
|
|
if (i + 3 < NumElts) {
|
|
StoreVal = OutVals[OIdx++];
|
|
if (NeedExtend)
|
|
StoreVal =
|
|
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
|
|
} else {
|
|
StoreVal = DAG.getUNDEF(EltVT);
|
|
}
|
|
Ops.push_back(StoreVal);
|
|
}
|
|
|
|
Ops.push_back(InFlag);
|
|
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
|
|
MemVT, MachinePointerInfo());
|
|
InFlag = Chain.getValue(1);
|
|
curOffset += PerStoreOffset;
|
|
}
|
|
}
|
|
++paramCount;
|
|
--OIdx;
|
|
continue;
|
|
}
|
|
// Plain scalar
|
|
// for ABI, declare .param .b<size> .param<n>;
|
|
unsigned sz = VT.getSizeInBits();
|
|
bool needExtend = false;
|
|
if (VT.isInteger()) {
|
|
if (sz < 16)
|
|
needExtend = true;
|
|
if (sz < 32)
|
|
sz = 32;
|
|
}
|
|
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue DeclareParamOps[] = { Chain,
|
|
DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(sz, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
|
|
DeclareParamOps);
|
|
InFlag = Chain.getValue(1);
|
|
SDValue OutV = OutVals[OIdx];
|
|
if (needExtend) {
|
|
// zext/sext i1 to i16
|
|
unsigned opc = ISD::ZERO_EXTEND;
|
|
if (Outs[OIdx].Flags.isSExt())
|
|
opc = ISD::SIGN_EXTEND;
|
|
OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
|
|
}
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), OutV, InFlag };
|
|
|
|
unsigned opcode = NVPTXISD::StoreParam;
|
|
if (Outs[OIdx].Flags.isZExt())
|
|
opcode = NVPTXISD::StoreParamU32;
|
|
else if (Outs[OIdx].Flags.isSExt())
|
|
opcode = NVPTXISD::StoreParamS32;
|
|
Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
|
|
VT, MachinePointerInfo());
|
|
|
|
InFlag = Chain.getValue(1);
|
|
++paramCount;
|
|
continue;
|
|
}
|
|
// struct or vector
|
|
SmallVector<EVT, 16> vtparts;
|
|
SmallVector<uint64_t, 16> Offsets;
|
|
const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
|
|
assert(PTy && "Type of a byval parameter should be pointer");
|
|
ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
|
|
|
|
// declare .param .align <align> .b8 .param<n>[<size>];
|
|
unsigned sz = Outs[OIdx].Flags.getByValSize();
|
|
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
|
|
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
|
|
// so we don't need to worry about natural alignment or not.
|
|
// See TargetLowering::LowerCallTo().
|
|
SDValue DeclareParamOps[] = {
|
|
Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32),
|
|
DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
|
|
InFlag
|
|
};
|
|
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
|
|
DeclareParamOps);
|
|
InFlag = Chain.getValue(1);
|
|
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
|
|
EVT elemtype = vtparts[j];
|
|
int curOffset = Offsets[j];
|
|
unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
|
|
SDValue srcAddr =
|
|
DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
|
|
DAG.getConstant(curOffset, getPointerTy()));
|
|
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
|
|
MachinePointerInfo(), false, false, false,
|
|
PartAlign);
|
|
if (elemtype.getSizeInBits() < 16) {
|
|
theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
|
|
}
|
|
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
|
|
DAG.getConstant(curOffset, MVT::i32), theVal,
|
|
InFlag };
|
|
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
|
|
CopyParamOps, elemtype,
|
|
MachinePointerInfo());
|
|
|
|
InFlag = Chain.getValue(1);
|
|
}
|
|
++paramCount;
|
|
}
|
|
|
|
GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
|
|
unsigned retAlignment = 0;
|
|
|
|
// Handle Result
|
|
if (Ins.size() > 0) {
|
|
SmallVector<EVT, 16> resvtparts;
|
|
ComputeValueVTs(*this, retTy, resvtparts);
|
|
|
|
// Declare
|
|
// .param .align 16 .b8 retval0[<size-in-bytes>], or
|
|
// .param .b<size-in-bits> retval0
|
|
unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
|
|
if (retTy->isSingleValueType()) {
|
|
// Scalar needs to be at least 32bit wide
|
|
if (resultsz < 32)
|
|
resultsz = 32;
|
|
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
|
|
DAG.getConstant(resultsz, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
|
|
DeclareRetOps);
|
|
InFlag = Chain.getValue(1);
|
|
} else {
|
|
retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
|
|
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue DeclareRetOps[] = { Chain,
|
|
DAG.getConstant(retAlignment, MVT::i32),
|
|
DAG.getConstant(resultsz / 8, MVT::i32),
|
|
DAG.getConstant(0, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
|
|
DeclareRetOps);
|
|
InFlag = Chain.getValue(1);
|
|
}
|
|
}
|
|
|
|
if (!Func) {
|
|
// This is indirect function call case : PTX requires a prototype of the
|
|
// form
|
|
// proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
|
|
// to be emitted, and the label has to be used as the last arg of call
|
|
// instruction.
|
|
// The prototype is embedded in a string and put as the operand for a
|
|
// CallPrototype SDNode which will print out to the value of the string.
|
|
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
|
|
const char *ProtoStr =
|
|
nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
|
|
SDValue ProtoOps[] = {
|
|
Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
|
|
};
|
|
Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
|
|
InFlag = Chain.getValue(1);
|
|
}
|
|
// Op to just print "call"
|
|
SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue PrintCallOps[] = {
|
|
Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
|
|
};
|
|
Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
|
|
dl, PrintCallVTs, PrintCallOps);
|
|
InFlag = Chain.getValue(1);
|
|
|
|
// Ops to print out the function name
|
|
SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CallVoidOps[] = { Chain, Callee, InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
|
|
InFlag = Chain.getValue(1);
|
|
|
|
// Ops to print out the param list
|
|
SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CallArgBeginOps[] = { Chain, InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
|
|
CallArgBeginOps);
|
|
InFlag = Chain.getValue(1);
|
|
|
|
for (unsigned i = 0, e = paramCount; i != e; ++i) {
|
|
unsigned opcode;
|
|
if (i == (e - 1))
|
|
opcode = NVPTXISD::LastCallArg;
|
|
else
|
|
opcode = NVPTXISD::CallArg;
|
|
SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
|
|
DAG.getConstant(i, MVT::i32), InFlag };
|
|
Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
|
|
InFlag = Chain.getValue(1);
|
|
}
|
|
SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
|
|
InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
|
|
InFlag = Chain.getValue(1);
|
|
|
|
if (!Func) {
|
|
SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
|
|
InFlag };
|
|
Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
|
|
InFlag = Chain.getValue(1);
|
|
}
|
|
|
|
// Generate loads from param memory/moves from registers for result
|
|
if (Ins.size() > 0) {
|
|
if (retTy && retTy->isVectorTy()) {
|
|
EVT ObjectVT = getValueType(retTy);
|
|
unsigned NumElts = ObjectVT.getVectorNumElements();
|
|
EVT EltVT = ObjectVT.getVectorElementType();
|
|
assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(),
|
|
ObjectVT) == NumElts &&
|
|
"Vector was not scalarized");
|
|
unsigned sz = EltVT.getSizeInBits();
|
|
bool needTruncate = sz < 8 ? true : false;
|
|
|
|
if (NumElts == 1) {
|
|
// Just a simple load
|
|
SmallVector<EVT, 4> LoadRetVTs;
|
|
if (EltVT == MVT::i1 || EltVT == MVT::i8) {
|
|
// If loading i1/i8 result, generate
|
|
// load.b8 i16
|
|
// if i1
|
|
// trunc i16 to i1
|
|
LoadRetVTs.push_back(MVT::i16);
|
|
} else
|
|
LoadRetVTs.push_back(EltVT);
|
|
LoadRetVTs.push_back(MVT::Other);
|
|
LoadRetVTs.push_back(MVT::Glue);
|
|
SmallVector<SDValue, 4> LoadRetOps;
|
|
LoadRetOps.push_back(Chain);
|
|
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
|
|
LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
|
|
LoadRetOps.push_back(InFlag);
|
|
SDValue retval = DAG.getMemIntrinsicNode(
|
|
NVPTXISD::LoadParam, dl,
|
|
DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
|
|
Chain = retval.getValue(1);
|
|
InFlag = retval.getValue(2);
|
|
SDValue Ret0 = retval;
|
|
if (needTruncate)
|
|
Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
|
|
InVals.push_back(Ret0);
|
|
} else if (NumElts == 2) {
|
|
// LoadV2
|
|
SmallVector<EVT, 4> LoadRetVTs;
|
|
if (EltVT == MVT::i1 || EltVT == MVT::i8) {
|
|
// If loading i1/i8 result, generate
|
|
// load.b8 i16
|
|
// if i1
|
|
// trunc i16 to i1
|
|
LoadRetVTs.push_back(MVT::i16);
|
|
LoadRetVTs.push_back(MVT::i16);
|
|
} else {
|
|
LoadRetVTs.push_back(EltVT);
|
|
LoadRetVTs.push_back(EltVT);
|
|
}
|
|
LoadRetVTs.push_back(MVT::Other);
|
|
LoadRetVTs.push_back(MVT::Glue);
|
|
SmallVector<SDValue, 4> LoadRetOps;
|
|
LoadRetOps.push_back(Chain);
|
|
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
|
|
LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
|
|
LoadRetOps.push_back(InFlag);
|
|
SDValue retval = DAG.getMemIntrinsicNode(
|
|
NVPTXISD::LoadParamV2, dl,
|
|
DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
|
|
Chain = retval.getValue(2);
|
|
InFlag = retval.getValue(3);
|
|
SDValue Ret0 = retval.getValue(0);
|
|
SDValue Ret1 = retval.getValue(1);
|
|
if (needTruncate) {
|
|
Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
|
|
InVals.push_back(Ret0);
|
|
Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
|
|
InVals.push_back(Ret1);
|
|
} else {
|
|
InVals.push_back(Ret0);
|
|
InVals.push_back(Ret1);
|
|
}
|
|
} else {
|
|
// Split into N LoadV4
|
|
unsigned Ofst = 0;
|
|
unsigned VecSize = 4;
|
|
unsigned Opc = NVPTXISD::LoadParamV4;
|
|
if (EltVT.getSizeInBits() == 64) {
|
|
VecSize = 2;
|
|
Opc = NVPTXISD::LoadParamV2;
|
|
}
|
|
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
|
|
for (unsigned i = 0; i < NumElts; i += VecSize) {
|
|
SmallVector<EVT, 8> LoadRetVTs;
|
|
if (EltVT == MVT::i1 || EltVT == MVT::i8) {
|
|
// If loading i1/i8 result, generate
|
|
// load.b8 i16
|
|
// if i1
|
|
// trunc i16 to i1
|
|
for (unsigned j = 0; j < VecSize; ++j)
|
|
LoadRetVTs.push_back(MVT::i16);
|
|
} else {
|
|
for (unsigned j = 0; j < VecSize; ++j)
|
|
LoadRetVTs.push_back(EltVT);
|
|
}
|
|
LoadRetVTs.push_back(MVT::Other);
|
|
LoadRetVTs.push_back(MVT::Glue);
|
|
SmallVector<SDValue, 4> LoadRetOps;
|
|
LoadRetOps.push_back(Chain);
|
|
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
|
|
LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
|
|
LoadRetOps.push_back(InFlag);
|
|
SDValue retval = DAG.getMemIntrinsicNode(
|
|
Opc, dl, DAG.getVTList(LoadRetVTs),
|
|
LoadRetOps, EltVT, MachinePointerInfo());
|
|
if (VecSize == 2) {
|
|
Chain = retval.getValue(2);
|
|
InFlag = retval.getValue(3);
|
|
} else {
|
|
Chain = retval.getValue(4);
|
|
InFlag = retval.getValue(5);
|
|
}
|
|
|
|
for (unsigned j = 0; j < VecSize; ++j) {
|
|
if (i + j >= NumElts)
|
|
break;
|
|
SDValue Elt = retval.getValue(j);
|
|
if (needTruncate)
|
|
Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
|
|
InVals.push_back(Elt);
|
|
}
|
|
Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
|
|
}
|
|
}
|
|
} else {
|
|
SmallVector<EVT, 16> VTs;
|
|
SmallVector<uint64_t, 16> Offsets;
|
|
ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
|
|
assert(VTs.size() == Ins.size() && "Bad value decomposition");
|
|
unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
|
|
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
|
|
unsigned sz = VTs[i].getSizeInBits();
|
|
unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
|
|
bool needTruncate = sz < 8 ? true : false;
|
|
if (VTs[i].isInteger() && (sz < 8))
|
|
sz = 8;
|
|
|
|
SmallVector<EVT, 4> LoadRetVTs;
|
|
EVT TheLoadType = VTs[i];
|
|
if (retTy->isIntegerTy() &&
|
|
TD->getTypeAllocSizeInBits(retTy) < 32) {
|
|
// This is for integer types only, and specifically not for
|
|
// aggregates.
|
|
LoadRetVTs.push_back(MVT::i32);
|
|
TheLoadType = MVT::i32;
|
|
} else if (sz < 16) {
|
|
// If loading i1/i8 result, generate
|
|
// load i8 (-> i16)
|
|
// trunc i16 to i1/i8
|
|
LoadRetVTs.push_back(MVT::i16);
|
|
} else
|
|
LoadRetVTs.push_back(Ins[i].VT);
|
|
LoadRetVTs.push_back(MVT::Other);
|
|
LoadRetVTs.push_back(MVT::Glue);
|
|
|
|
SmallVector<SDValue, 4> LoadRetOps;
|
|
LoadRetOps.push_back(Chain);
|
|
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
|
|
LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
|
|
LoadRetOps.push_back(InFlag);
|
|
SDValue retval = DAG.getMemIntrinsicNode(
|
|
NVPTXISD::LoadParam, dl,
|
|
DAG.getVTList(LoadRetVTs), LoadRetOps,
|
|
TheLoadType, MachinePointerInfo(), AlignI);
|
|
Chain = retval.getValue(1);
|
|
InFlag = retval.getValue(2);
|
|
SDValue Ret0 = retval.getValue(0);
|
|
if (needTruncate)
|
|
Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
|
|
InVals.push_back(Ret0);
|
|
}
|
|
}
|
|
}
|
|
|
|
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
|
|
DAG.getIntPtrConstant(uniqueCallSite + 1, true),
|
|
InFlag, dl);
|
|
uniqueCallSite++;
|
|
|
|
// set isTailCall to false for now, until we figure out how to express
|
|
// tail call optimization in PTX
|
|
isTailCall = false;
|
|
return Chain;
|
|
}
|
|
|
|
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
|
|
// (see LegalizeDAG.cpp). This is slow and uses local memory.
|
|
// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
|
|
SDValue
|
|
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
|
|
SDNode *Node = Op.getNode();
|
|
SDLoc dl(Node);
|
|
SmallVector<SDValue, 8> Ops;
|
|
unsigned NumOperands = Node->getNumOperands();
|
|
for (unsigned i = 0; i < NumOperands; ++i) {
|
|
SDValue SubOp = Node->getOperand(i);
|
|
EVT VVT = SubOp.getNode()->getValueType(0);
|
|
EVT EltVT = VVT.getVectorElementType();
|
|
unsigned NumSubElem = VVT.getVectorNumElements();
|
|
for (unsigned j = 0; j < NumSubElem; ++j) {
|
|
Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
|
|
DAG.getIntPtrConstant(j)));
|
|
}
|
|
}
|
|
return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
|
|
}
|
|
|
|
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
|
|
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
|
|
/// amount, or
|
|
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
|
|
/// amount.
|
|
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
|
|
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
|
|
|
|
EVT VT = Op.getValueType();
|
|
unsigned VTBits = VT.getSizeInBits();
|
|
SDLoc dl(Op);
|
|
SDValue ShOpLo = Op.getOperand(0);
|
|
SDValue ShOpHi = Op.getOperand(1);
|
|
SDValue ShAmt = Op.getOperand(2);
|
|
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
|
|
|
|
if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
|
|
|
|
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
|
|
// {dHi, dLo} = {aHi, aLo} >> Amt
|
|
// dHi = aHi >> Amt
|
|
// dLo = shf.r.clamp aLo, aHi, Amt
|
|
|
|
SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
|
|
SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
|
|
ShAmt);
|
|
|
|
SDValue Ops[2] = { Lo, Hi };
|
|
return DAG.getMergeValues(Ops, dl);
|
|
}
|
|
else {
|
|
|
|
// {dHi, dLo} = {aHi, aLo} >> Amt
|
|
// - if (Amt>=size) then
|
|
// dLo = aHi >> (Amt-size)
|
|
// dHi = aHi >> Amt (this is either all 0 or all 1)
|
|
// else
|
|
// dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
|
|
// dHi = aHi >> Amt
|
|
|
|
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
|
|
DAG.getConstant(VTBits, MVT::i32), ShAmt);
|
|
SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
|
|
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
|
|
DAG.getConstant(VTBits, MVT::i32));
|
|
SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
|
|
SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
|
|
SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
|
|
|
|
SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
|
|
DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
|
|
SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
|
|
SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
|
|
|
|
SDValue Ops[2] = { Lo, Hi };
|
|
return DAG.getMergeValues(Ops, dl);
|
|
}
|
|
}
|
|
|
|
/// LowerShiftLeftParts - Lower SHL_PARTS, which
|
|
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
|
|
/// amount, or
|
|
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
|
|
/// amount.
|
|
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
|
|
assert(Op.getOpcode() == ISD::SHL_PARTS);
|
|
|
|
EVT VT = Op.getValueType();
|
|
unsigned VTBits = VT.getSizeInBits();
|
|
SDLoc dl(Op);
|
|
SDValue ShOpLo = Op.getOperand(0);
|
|
SDValue ShOpHi = Op.getOperand(1);
|
|
SDValue ShAmt = Op.getOperand(2);
|
|
|
|
if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
|
|
|
|
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
|
|
// {dHi, dLo} = {aHi, aLo} << Amt
|
|
// dHi = shf.l.clamp aLo, aHi, Amt
|
|
// dLo = aLo << Amt
|
|
|
|
SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
|
|
ShAmt);
|
|
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
|
|
|
|
SDValue Ops[2] = { Lo, Hi };
|
|
return DAG.getMergeValues(Ops, dl);
|
|
}
|
|
else {
|
|
|
|
// {dHi, dLo} = {aHi, aLo} << Amt
|
|
// - if (Amt>=size) then
|
|
// dLo = aLo << Amt (all 0)
|
|
// dLo = aLo << (Amt-size)
|
|
// else
|
|
// dLo = aLo << Amt
|
|
// dHi = (aHi << Amt) | (aLo >> (size-Amt))
|
|
|
|
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
|
|
DAG.getConstant(VTBits, MVT::i32), ShAmt);
|
|
SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
|
|
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
|
|
DAG.getConstant(VTBits, MVT::i32));
|
|
SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
|
|
SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
|
|
SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
|
|
|
|
SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
|
|
DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
|
|
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
|
|
SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
|
|
|
|
SDValue Ops[2] = { Lo, Hi };
|
|
return DAG.getMergeValues(Ops, dl);
|
|
}
|
|
}
|
|
|
|
/// Entry point for custom lowering: dispatches on the opcode of \p Op to the
/// matching Lower* helper. Opcodes without a handler here hit
/// llvm_unreachable.
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  // These produce an empty SDValue.
  case ISD::RETURNADDR:
  case ISD::FRAMEADDR:
    return SDValue();

  // These are handed back unchanged.
  case ISD::INTRINSIC_W_CHAIN:
  case ISD::BUILD_VECTOR:
  case ISD::EXTRACT_SUBVECTOR:
    return Op;

  // Everything else is forwarded to a dedicated helper.
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);

  default:
    break;
  }
  llvm_unreachable("Custom lowering not defined for operation");
}
|
|
|
|
/// Custom lowering for LOAD nodes. Only loads whose result type is i1 are
/// rewritten (by LowerLOADi1); any other load yields an empty SDValue.
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i1)
    return SDValue();
  return LowerLOADi1(Op, DAG);
}
|
|
|
|
// v = ld i1* addr
|
|
// =>
|
|
// v1 = ld i8* addr (-> i16)
|
|
// v = trunc i16 to i1
|
|
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
|
|
SDNode *Node = Op.getNode();
|
|
LoadSDNode *LD = cast<LoadSDNode>(Node);
|
|
SDLoc dl(Node);
|
|
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
|
|
assert(Node->getValueType(0) == MVT::i1 &&
|
|
"Custom lowering for i1 load only");
|
|
SDValue newLD =
|
|
DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
|
|
LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
|
|
LD->isInvariant(), LD->getAlignment());
|
|
SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
|
|
// The legalizer (the caller) is expecting two values from the legalized
|
|
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
|
|
// in LegalizeDAG.cpp which also uses MergeValues.
|
|
SDValue Ops[] = { result, LD->getChain() };
|
|
return DAG.getMergeValues(Ops, dl);
|
|
}
|
|
|
|
/// Custom lowering for STORE nodes, chosen by the type of the stored value
/// (operand 1): i1 goes to LowerSTOREi1, vectors go to LowerSTOREVector, and
/// anything else yields an empty SDValue.
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  EVT StoredVT = Op.getOperand(1).getValueType();

  if (StoredVT == MVT::i1)
    return LowerSTOREi1(Op, DAG);

  if (StoredVT.isVector())
    return LowerSTOREVector(Op, DAG);

  return SDValue();
}
|
|
|
|
/// Lower a store of a "native" 2- or 4-element vector into a single
/// NVPTXISD::StoreV2 / NVPTXISD::StoreV4 node whose operands are the chain,
/// the individual vector elements, and then the remaining operands of the
/// original store. Returns an empty SDValue for every other value type.
SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDValue Val = N->getOperand(1);
  SDLoc DL(N);
  EVT ValVT = Val.getValueType();

  if (!ValVT.isVector())
    return SDValue();

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 stores of <2 x double> here
  // but that is left as a TODO for now.
  if (!ValVT.isSimple())
    return SDValue();

  switch (ValVT.getSimpleVT().SimpleTy) {
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f32:
    // This is a "native" vector type
    break;
  default:
    return SDValue();
  }

  EVT EltVT = ValVT.getVectorElementType();
  unsigned NumElts = ValVT.getVectorNumElements();

  // Pick the target store node from the element count.
  unsigned Opcode = 0;
  if (NumElts == 2)
    Opcode = NVPTXISD::StoreV2;
  else if (NumElts == 4)
    Opcode = NVPTXISD::StoreV4;
  else
    return SDValue();

  // Since StoreV2/StoreV4 are target nodes, we cannot rely on DAG type
  // legalization and must make sure the operand types are legal ourselves.
  // Elements narrower than 16 bits are widened to i16; the "real" type is
  // still conveyed through the memory type of the original store.
  const bool NeedExt = EltVT.getSizeInBits() < 16;

  SmallVector<SDValue, 8> Ops;

  // First is the chain
  Ops.push_back(N->getOperand(0));

  // Then the split values
  for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                              DAG.getIntPtrConstant(EltIdx));
    if (NeedExt)
      Elt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Elt);
    Ops.push_back(Elt);
  }

  // Then any remaining arguments
  for (unsigned OpIdx = 2, NumOps = N->getNumOperands(); OpIdx != NumOps;
       ++OpIdx)
    Ops.push_back(N->getOperand(OpIdx));

  // Reuse the memory type and memory operand of the original store.
  MemSDNode *MemSD = cast<MemSDNode>(N);
  return DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
                                 MemSD->getMemoryVT(),
                                 MemSD->getMemOperand());
}
|
|
|
|
// st i1 v, addr
|
|
// =>
|
|
// v1 = zxt v to i16
|
|
// st.u8 i16, addr
|
|
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
|
|
SDNode *Node = Op.getNode();
|
|
SDLoc dl(Node);
|
|
StoreSDNode *ST = cast<StoreSDNode>(Node);
|
|
SDValue Tmp1 = ST->getChain();
|
|
SDValue Tmp2 = ST->getBasePtr();
|
|
SDValue Tmp3 = ST->getValue();
|
|
assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
|
|
unsigned Alignment = ST->getAlignment();
|
|
bool isVolatile = ST->isVolatile();
|
|
bool isNonTemporal = ST->isNonTemporal();
|
|
Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
|
|
SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
|
|
ST->getPointerInfo(), MVT::i8, isNonTemporal,
|
|
isVolatile, Alignment);
|
|
return Result;
|
|
}
|
|
|
|
/// Build a target external symbol of type \p v named "<inname><idx>".
/// The name is stored in the target machine's managed string pool and the
/// symbol node refers to that pooled storage.
SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
                                        int idx, EVT v) const {
  // Pool the base name first, then append the decimal index in place.
  std::string *Pooled = nvTM->getManagedStrPool()->getManagedString(inname);

  std::stringstream IdxText;
  IdxText << idx;
  Pooled->append(IdxText.str());

  return DAG.getTargetExternalSymbol(Pooled->c_str(), v);
}
|
|
|
|
/// Build a target external symbol of type \p v named
/// "<current function name>_param_<idx>". The name is stored in the target
/// machine's managed string pool.
SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
  std::string SymName;
  raw_string_ostream OS(SymName);
  OS << DAG.getMachineFunction().getName() << "_param_" << idx;

  // str() flushes the stream into SymName before the name is pooled.
  std::string *Pooled =
      nvTM->getManagedStrPool()->getManagedString(OS.str().c_str());
  return DAG.getTargetExternalSymbol(Pooled->c_str(), v);
}
|
|
|
|
/// Return the external symbol named ".HLPPARAM<idx>", built via getExtSymb.
SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
  const char *HelperPrefix = ".HLPPARAM";
  return getExtSymb(DAG, HelperPrefix, idx);
}
|
|
|
|
// Check to see if the kernel argument is image*_t or sampler_t
|
|
|
|
bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
|
|
static const char *const specialTypes[] = { "struct._image2d_t",
|
|
"struct._image3d_t",
|
|
"struct._sampler_t" };
|
|
|
|
const Type *Ty = arg->getType();
|
|
const PointerType *PTy = dyn_cast<PointerType>(Ty);
|
|
|
|
if (!PTy)
|
|
return false;
|
|
|
|
if (!context)
|
|
return false;
|
|
|
|
const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
|
|
const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
|
|
|
|
for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
|
|
if (TypeName == specialTypes[i])
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
SDValue NVPTXTargetLowering::LowerFormalArguments(
|
|
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
|
|
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
|
|
SmallVectorImpl<SDValue> &InVals) const {
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
const DataLayout *TD = getDataLayout();
|
|
|
|
const Function *F = MF.getFunction();
|
|
const AttributeSet &PAL = F->getAttributes();
|
|
const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
|
|
|
|
SDValue Root = DAG.getRoot();
|
|
std::vector<SDValue> OutChains;
|
|
|
|
bool isKernel = llvm::isKernelFunction(*F);
|
|
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
|
|
assert(isABI && "Non-ABI compilation is not supported");
|
|
if (!isABI)
|
|
return Chain;
|
|
|
|
std::vector<Type *> argTypes;
|
|
std::vector<const Argument *> theArgs;
|
|
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
|
|
I != E; ++I) {
|
|
theArgs.push_back(I);
|
|
argTypes.push_back(I->getType());
|
|
}
|
|
// argTypes.size() (or theArgs.size()) and Ins.size() need not match.
|
|
// Ins.size() will be larger
|
|
// * if there is an aggregate argument with multiple fields (each field
|
|
// showing up separately in Ins)
|
|
// * if there is a vector argument with more than typical vector-length
|
|
// elements (generally if more than 4) where each vector element is
|
|
// individually present in Ins.
|
|
// So a different index should be used for indexing into Ins.
|
|
// See similar issue in LowerCall.
|
|
unsigned InsIdx = 0;
|
|
|
|
int idx = 0;
|
|
for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
|
|
Type *Ty = argTypes[i];
|
|
|
|
// If the kernel argument is image*_t or sampler_t, convert it to
|
|
// an i32 constant holding the parameter position. This can later be
|
|
// matched in the AsmPrinter to output the correct mangled name.
|
|
if (isImageOrSamplerVal(
|
|
theArgs[i],
|
|
(theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
|
|
: nullptr))) {
|
|
assert(isKernel && "Only kernels can have image/sampler params");
|
|
InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
|
|
continue;
|
|
}
|
|
|
|
if (theArgs[i]->use_empty()) {
|
|
// argument is dead
|
|
if (Ty->isAggregateType()) {
|
|
SmallVector<EVT, 16> vtparts;
|
|
|
|
ComputePTXValueVTs(*this, Ty, vtparts);
|
|
assert(vtparts.size() > 0 && "empty aggregate type not expected");
|
|
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
|
|
++parti) {
|
|
InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
|
|
++InsIdx;
|
|
}
|
|
if (vtparts.size() > 0)
|
|
--InsIdx;
|
|
continue;
|
|
}
|
|
if (Ty->isVectorTy()) {
|
|
EVT ObjectVT = getValueType(Ty);
|
|
unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
|
|
for (unsigned parti = 0; parti < NumRegs; ++parti) {
|
|
InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
|
|
++InsIdx;
|
|
}
|
|
if (NumRegs > 0)
|
|
--InsIdx;
|
|
continue;
|
|
}
|
|
InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
|
|
continue;
|
|
}
|
|
|
|
// In the following cases, assign a node order of "idx+1"
|
|
// to newly created nodes. The SDNodes for params have to
|
|
// appear in the same order as their order of appearance
|
|
// in the original function. "idx+1" holds that order.
|
|
if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) {
|
|
if (Ty->isAggregateType()) {
|
|
SmallVector<EVT, 16> vtparts;
|
|
SmallVector<uint64_t, 16> offsets;
|
|
|
|
// NOTE: Here, we lose the ability to issue vector loads for vectors
|
|
// that are a part of a struct. This should be investigated in the
|
|
// future.
|
|
ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
|
|
assert(vtparts.size() > 0 && "empty aggregate type not expected");
|
|
bool aggregateIsPacked = false;
|
|
if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
|
|
aggregateIsPacked = STy->isPacked();
|
|
|
|
SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
|
|
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
|
|
++parti) {
|
|
EVT partVT = vtparts[parti];
|
|
Value *srcValue = Constant::getNullValue(
|
|
PointerType::get(partVT.getTypeForEVT(F->getContext()),
|
|
llvm::ADDRESS_SPACE_PARAM));
|
|
SDValue srcAddr =
|
|
DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
|
|
DAG.getConstant(offsets[parti], getPointerTy()));
|
|
unsigned partAlign =
|
|
aggregateIsPacked ? 1
|
|
: TD->getABITypeAlignment(
|
|
partVT.getTypeForEVT(F->getContext()));
|
|
SDValue p;
|
|
if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
|
|
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
|
|
ISD::SEXTLOAD : ISD::ZEXTLOAD;
|
|
p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
|
|
MachinePointerInfo(srcValue), partVT, false,
|
|
false, partAlign);
|
|
} else {
|
|
p = DAG.getLoad(partVT, dl, Root, srcAddr,
|
|
MachinePointerInfo(srcValue), false, false, false,
|
|
partAlign);
|
|
}
|
|
if (p.getNode())
|
|
p.getNode()->setIROrder(idx + 1);
|
|
InVals.push_back(p);
|
|
++InsIdx;
|
|
}
|
|
if (vtparts.size() > 0)
|
|
--InsIdx;
|
|
continue;
|
|
}
|
|
if (Ty->isVectorTy()) {
|
|
EVT ObjectVT = getValueType(Ty);
|
|
SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
|
|
unsigned NumElts = ObjectVT.getVectorNumElements();
|
|
assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
|
|
"Vector was not scalarized");
|
|
unsigned Ofst = 0;
|
|
EVT EltVT = ObjectVT.getVectorElementType();
|
|
|
|
// V1 load
|
|
// f32 = load ...
|
|
if (NumElts == 1) {
|
|
// We only have one element, so just directly load it
|
|
Value *SrcValue = Constant::getNullValue(PointerType::get(
|
|
EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
|
|
SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
|
|
DAG.getConstant(Ofst, getPointerTy()));
|
|
SDValue P = DAG.getLoad(
|
|
EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
|
|
false, true,
|
|
TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
|
|
if (P.getNode())
|
|
P.getNode()->setIROrder(idx + 1);
|
|
|
|
if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
|
|
P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
|
|
InVals.push_back(P);
|
|
Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext()));
|
|
++InsIdx;
|
|
} else if (NumElts == 2) {
|
|
// V2 load
|
|
// f32,f32 = load ...
|
|
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
|
|
Value *SrcValue = Constant::getNullValue(PointerType::get(
|
|
VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
|
|
SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
|
|
DAG.getConstant(Ofst, getPointerTy()));
|
|
SDValue P = DAG.getLoad(
|
|
VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
|
|
false, true,
|
|
TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
|
|
if (P.getNode())
|
|
P.getNode()->setIROrder(idx + 1);
|
|
|
|
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
|
|
DAG.getIntPtrConstant(0));
|
|
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
|
|
DAG.getIntPtrConstant(1));
|
|
|
|
if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
|
|
Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
|
|
Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
|
|
}
|
|
|
|
InVals.push_back(Elt0);
|
|
InVals.push_back(Elt1);
|
|
Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
|
|
InsIdx += 2;
|
|
} else {
|
|
// V4 loads
|
|
// We have at least 4 elements (<3 x Ty> expands to 4 elements) and
|
|
// the
|
|
// vector will be expanded to a power of 2 elements, so we know we can
|
|
// always round up to the next multiple of 4 when creating the vector
|
|
// loads.
|
|
// e.g. 4 elem => 1 ld.v4
|
|
// 6 elem => 2 ld.v4
|
|
// 8 elem => 2 ld.v4
|
|
// 11 elem => 3 ld.v4
|
|
unsigned VecSize = 4;
|
|
if (EltVT.getSizeInBits() == 64) {
|
|
VecSize = 2;
|
|
}
|
|
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
|
|
for (unsigned i = 0; i < NumElts; i += VecSize) {
|
|
Value *SrcValue = Constant::getNullValue(
|
|
PointerType::get(VecVT.getTypeForEVT(F->getContext()),
|
|
llvm::ADDRESS_SPACE_PARAM));
|
|
SDValue SrcAddr =
|
|
DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
|
|
DAG.getConstant(Ofst, getPointerTy()));
|
|
SDValue P = DAG.getLoad(
|
|
VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
|
|
false, true,
|
|
TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
|
|
if (P.getNode())
|
|
P.getNode()->setIROrder(idx + 1);
|
|
|
|
for (unsigned j = 0; j < VecSize; ++j) {
|
|
if (i + j >= NumElts)
|
|
break;
|
|
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
|
|
DAG.getIntPtrConstant(j));
|
|
if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
|
|
Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
|
|
InVals.push_back(Elt);
|
|
}
|
|
Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
|
|
}
|
|
InsIdx += NumElts;
|
|
}
|
|
|
|
if (NumElts > 0)
|
|
--InsIdx;
|
|
continue;
|
|
}
|
|
// A plain scalar.
|
|
EVT ObjectVT = getValueType(Ty);
|
|
// If ABI, load from the param symbol
|
|
SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
|
|
Value *srcValue = Constant::getNullValue(PointerType::get(
|
|
ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
|
|
SDValue p;
|
|
if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
|
|
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
|
|
ISD::SEXTLOAD : ISD::ZEXTLOAD;
|
|
p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
|
|
MachinePointerInfo(srcValue), ObjectVT, false, false,
|
|
TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
|
|
} else {
|
|
p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
|
|
MachinePointerInfo(srcValue), false, false, false,
|
|
TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
|
|
}
|
|
if (p.getNode())
|
|
p.getNode()->setIROrder(idx + 1);
|
|
InVals.push_back(p);
|
|
continue;
|
|
}
|
|
|
|
// Param has ByVal attribute
|
|
// Return MoveParam(param symbol).
|
|
// Ideally, the param symbol can be returned directly,
|
|
// but when SDNode builder decides to use it in a CopyToReg(),
|
|
// machine instruction fails because TargetExternalSymbol
|
|
// (not lowered) is target dependent, and CopyToReg assumes
|
|
// the source is lowered.
|
|
EVT ObjectVT = getValueType(Ty);
|
|
assert(ObjectVT == Ins[InsIdx].VT &&
|
|
"Ins type did not match function type");
|
|
SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
|
|
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
|
|
if (p.getNode())
|
|
p.getNode()->setIROrder(idx + 1);
|
|
if (isKernel)
|
|
InVals.push_back(p);
|
|
else {
|
|
SDValue p2 = DAG.getNode(
|
|
ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
|
|
DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
|
|
InVals.push_back(p2);
|
|
}
|
|
}
|
|
|
|
// Clang will check explicit VarArg and issue error if any. However, Clang
|
|
// will let code with
|
|
// implicit var arg like f() pass. See bug 617733.
|
|
// We treat this case as if the arg list is empty.
|
|
// if (F.isVarArg()) {
|
|
// assert(0 && "VarArg not supported yet!");
|
|
//}
|
|
|
|
if (!OutChains.empty())
|
|
DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
|
|
|
|
return Chain;
|
|
}
|
|
|
|
|
|
/// LowerReturn - Emit the nodes that write the function's return value and
/// the terminating RET_FLAG node.
///
/// \p Outs / \p OutVals describe the pieces of the value being returned.
/// Vector return types arrive scalarized and are re-packed into StoreRetval,
/// StoreRetvalV2 or StoreRetvalV4 nodes.  Every other return type is split
/// with ComputePTXValueVTs and written one piece at a time with StoreRetval,
/// at the offset that helper reports.  \p CallConv and \p isVarArg are not
/// consulted.
///
/// \returns the RET_FLAG node chained after all return-value stores (or
/// \p Chain untouched when the subtarget is older than sm_20, which is only
/// reachable in builds without assertions).
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 SDLoc dl, SelectionDAG &DAG) const {
  const Function *F = DAG.getMachineFunction().getFunction();
  Type *RetTy = F->getReturnType();
  const DataLayout *TD = getDataLayout();

  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
    // For a vector type, the OutVals array holds the scalarized components
    // and we have to combine them into one or more vector stores.
    unsigned NumElts = VTy->getNumElements();
    assert(NumElts == Outs.size() && "Bad scalarization of return value");

    EVT EltVT = getValueType(RetTy).getVectorElementType();

    // Elements narrower than 16 bits are zero-extended to i16 before being
    // handed to the store node.
    const bool NeedExtend = EltVT.getSizeInBits() < 16;
    auto WidenForStore = [&](SDValue V) -> SDValue {
      return NeedExtend ? DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, V) : V;
    };

    if (NumElts == 1) {
      // V1 store: we only have one element, so just directly store it.
      SDValue Elt = WidenForStore(OutVals[0]);
      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), Elt };
      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                      DAG.getVTList(MVT::Other), Ops, EltVT,
                                      MachinePointerInfo());
    } else if (NumElts == 2) {
      // V2 store
      SDValue Elt0 = WidenForStore(OutVals[0]);
      SDValue Elt1 = WidenForStore(OutVals[1]);
      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), Elt0, Elt1 };
      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
                                      DAG.getVTList(MVT::Other), Ops, EltVT,
                                      MachinePointerInfo());
    } else {
      // V4 stores
      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
      // vector will be expanded to a power of 2 elements, so we know we can
      // always round up to the next multiple of 4 when creating the vector
      // stores.
      // e.g.  4 elem => 1 st.v4
      //       6 elem => 2 st.v4
      //       8 elem => 2 st.v4
      //      11 elem => 3 st.v4
      //
      // 64-bit values are stored two at a time (StoreRetvalV2) instead.
      const unsigned VecSize =
          OutVals[0].getValueType().getSizeInBits() == 64 ? 2 : 4;
      const unsigned Opc =
          VecSize == 4 ? NVPTXISD::StoreRetvalV4 : NVPTXISD::StoreRetvalV2;

      EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
      EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
      const unsigned PerStoreOffset =
          TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));

      unsigned Offset = 0;
      for (unsigned i = 0; i < NumElts; i += VecSize, Offset += PerStoreOffset) {
        SmallVector<SDValue, 8> Ops;
        Ops.push_back(Chain);
        Ops.push_back(DAG.getConstant(Offset, MVT::i32));

        // One operand per lane; lanes past the end of the vector are undef.
        for (unsigned Lane = 0; Lane != VecSize; ++Lane) {
          const unsigned Idx = i + Lane;
          Ops.push_back(Idx < NumElts ? WidenForStore(OutVals[Idx])
                                      : DAG.getUNDEF(ExtendedVT));
        }

        Chain = DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other),
                                        Ops, EltVT, MachinePointerInfo());
      }
    }
  } else {
    SmallVector<EVT, 16> ValVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");

    // The zero-extension to i32 is for integer return types only, and
    // specifically not for aggregates.
    const bool PromoteToI32 =
        RetTy->isIntegerTy() && TD->getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
      SDValue Piece = OutVals[i];
      EVT PieceVT = Piece.getValueType();
      const bool PieceIsVector = PieceVT.isVector();
      const unsigned NumLanes =
          PieceIsVector ? PieceVT.getVectorNumElements() : 1;

      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        SDValue StoreVal = Piece;
        if (PieceIsVector)
          StoreVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                 PieceVT.getVectorElementType(), StoreVal,
                                 DAG.getIntPtrConstant(Lane));

        EVT MemVT = ValVTs[i];
        if (PromoteToI32) {
          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, StoreVal);
          MemVT = MVT::i32;
        } else if (StoreVal.getValueType().getSizeInBits() < 16) {
          StoreVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StoreVal);
        }

        SDValue Ops[] = { Chain, DAG.getConstant(Offsets[i], MVT::i32),
                          StoreVal };
        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                        DAG.getVTList(MVT::Other), Ops, MemVT,
                                        MachinePointerInfo());
      }
    }
  }

  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}
|
|
|
|
|
|
/// Lower an inline-asm operand for \p Constraint.  Constraints longer than a
/// single character are ignored here (nothing is appended to \p Ops); all
/// others are forwarded to the generic TargetLowering implementation.
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.length() <= 1)
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
|
|
|
|
// NVPTX suuport vector of legal types of any length in Intrinsics because the
|
|
// NVPTX specific type legalizer
|
|
// will legalize them to the PTX supported length.
|
|
bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
|
|
if (isTypeLegal(VT))
|
|
return true;
|
|
if (VT.isVector()) {
|
|
MVT eVT = VT.getVectorElementType();
|
|
if (isTypeLegal(eVT))
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
|
|
switch (Intrinsic) {
|
|
default:
|
|
return 0;
|
|
|
|
case Intrinsic::nvvm_tex_1d_v4f32_i32:
|
|
return NVPTXISD::Tex1DFloatI32;
|
|
case Intrinsic::nvvm_tex_1d_v4f32_f32:
|
|
return NVPTXISD::Tex1DFloatFloat;
|
|
case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
|
|
return NVPTXISD::Tex1DFloatFloatLevel;
|
|
case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
|
|
return NVPTXISD::Tex1DFloatFloatGrad;
|
|
case Intrinsic::nvvm_tex_1d_v4i32_i32:
|
|
return NVPTXISD::Tex1DI32I32;
|
|
case Intrinsic::nvvm_tex_1d_v4i32_f32:
|
|
return NVPTXISD::Tex1DI32Float;
|
|
case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
|
|
return NVPTXISD::Tex1DI32FloatLevel;
|
|
case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
|
|
return NVPTXISD::Tex1DI32FloatGrad;
|
|
|
|
case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
|
|
return NVPTXISD::Tex1DArrayFloatI32;
|
|
case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
|
|
return NVPTXISD::Tex1DArrayFloatFloat;
|
|
case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
|
|
return NVPTXISD::Tex1DArrayFloatFloatLevel;
|
|
case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
|
|
return NVPTXISD::Tex1DArrayFloatFloatGrad;
|
|
case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
|
|
return NVPTXISD::Tex1DArrayI32I32;
|
|
case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
|
|
return NVPTXISD::Tex1DArrayI32Float;
|
|
case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
|
|
return NVPTXISD::Tex1DArrayI32FloatLevel;
|
|
case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
|
|
return NVPTXISD::Tex1DArrayI32FloatGrad;
|
|
|
|
case Intrinsic::nvvm_tex_2d_v4f32_i32:
|
|
return NVPTXISD::Tex2DFloatI32;
|
|
case Intrinsic::nvvm_tex_2d_v4f32_f32:
|
|
return NVPTXISD::Tex2DFloatFloat;
|
|
case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
|
|
return NVPTXISD::Tex2DFloatFloatLevel;
|
|
case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
|
|
return NVPTXISD::Tex2DFloatFloatGrad;
|
|
case Intrinsic::nvvm_tex_2d_v4i32_i32:
|
|
return NVPTXISD::Tex2DI32I32;
|
|
case Intrinsic::nvvm_tex_2d_v4i32_f32:
|
|
return NVPTXISD::Tex2DI32Float;
|
|
case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
|
|
return NVPTXISD::Tex2DI32FloatLevel;
|
|
case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
|
|
return NVPTXISD::Tex2DI32FloatGrad;
|
|
|
|
case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
|
|
return NVPTXISD::Tex2DArrayFloatI32;
|
|
case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
|
|
return NVPTXISD::Tex2DArrayFloatFloat;
|
|
case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
|
|
return NVPTXISD::Tex2DArrayFloatFloatLevel;
|
|
case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
|
|
return NVPTXISD::Tex2DArrayFloatFloatGrad;
|
|
case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
|
|
return NVPTXISD::Tex2DArrayI32I32;
|
|
case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
|
|
return NVPTXISD::Tex2DArrayI32Float;
|
|
case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
|
|
return NVPTXISD::Tex2DArrayI32FloatLevel;
|
|
case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
|
|
return NVPTXISD::Tex2DArrayI32FloatGrad;
|
|
|
|
case Intrinsic::nvvm_tex_3d_v4f32_i32:
|
|
return NVPTXISD::Tex3DFloatI32;
|
|
case Intrinsic::nvvm_tex_3d_v4f32_f32:
|
|
return NVPTXISD::Tex3DFloatFloat;
|
|
case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
|
|
return NVPTXISD::Tex3DFloatFloatLevel;
|
|
case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
|
|
return NVPTXISD::Tex3DFloatFloatGrad;
|
|
case Intrinsic::nvvm_tex_3d_v4i32_i32:
|
|
return NVPTXISD::Tex3DI32I32;
|
|
case Intrinsic::nvvm_tex_3d_v4i32_f32:
|
|
return NVPTXISD::Tex3DI32Float;
|
|
case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
|
|
return NVPTXISD::Tex3DI32FloatLevel;
|
|
case Intrinsic::nvvm_tex_3d_grad_v4i32_f32:
|
|
return NVPTXISD::Tex3DI32FloatGrad;
|
|
}
|
|
}
|
|
|
|
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
|
|
switch (Intrinsic) {
|
|
default:
|
|
return 0;
|
|
case Intrinsic::nvvm_suld_1d_i8_trap:
|
|
return NVPTXISD::Suld1DI8Trap;
|
|
case Intrinsic::nvvm_suld_1d_i16_trap:
|
|
return NVPTXISD::Suld1DI16Trap;
|
|
case Intrinsic::nvvm_suld_1d_i32_trap:
|
|
return NVPTXISD::Suld1DI32Trap;
|
|
case Intrinsic::nvvm_suld_1d_v2i8_trap:
|
|
return NVPTXISD::Suld1DV2I8Trap;
|
|
case Intrinsic::nvvm_suld_1d_v2i16_trap:
|
|
return NVPTXISD::Suld1DV2I16Trap;
|
|
case Intrinsic::nvvm_suld_1d_v2i32_trap:
|
|
return NVPTXISD::Suld1DV2I32Trap;
|
|
case Intrinsic::nvvm_suld_1d_v4i8_trap:
|
|
return NVPTXISD::Suld1DV4I8Trap;
|
|
case Intrinsic::nvvm_suld_1d_v4i16_trap:
|
|
return NVPTXISD::Suld1DV4I16Trap;
|
|
case Intrinsic::nvvm_suld_1d_v4i32_trap:
|
|
return NVPTXISD::Suld1DV4I32Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_i8_trap:
|
|
return NVPTXISD::Suld1DArrayI8Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_i16_trap:
|
|
return NVPTXISD::Suld1DArrayI16Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_i32_trap:
|
|
return NVPTXISD::Suld1DArrayI32Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
|
|
return NVPTXISD::Suld1DArrayV2I8Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
|
|
return NVPTXISD::Suld1DArrayV2I16Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
|
|
return NVPTXISD::Suld1DArrayV2I32Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
|
|
return NVPTXISD::Suld1DArrayV4I8Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
|
|
return NVPTXISD::Suld1DArrayV4I16Trap;
|
|
case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
|
|
return NVPTXISD::Suld1DArrayV4I32Trap;
|
|
case Intrinsic::nvvm_suld_2d_i8_trap:
|
|
return NVPTXISD::Suld2DI8Trap;
|
|
case Intrinsic::nvvm_suld_2d_i16_trap:
|
|
return NVPTXISD::Suld2DI16Trap;
|
|
case Intrinsic::nvvm_suld_2d_i32_trap:
|
|
return NVPTXISD::Suld2DI32Trap;
|
|
case Intrinsic::nvvm_suld_2d_v2i8_trap:
|
|
return NVPTXISD::Suld2DV2I8Trap;
|
|
case Intrinsic::nvvm_suld_2d_v2i16_trap:
|
|
return NVPTXISD::Suld2DV2I16Trap;
|
|
case Intrinsic::nvvm_suld_2d_v2i32_trap:
|
|
return NVPTXISD::Suld2DV2I32Trap;
|
|
case Intrinsic::nvvm_suld_2d_v4i8_trap:
|
|
return NVPTXISD::Suld2DV4I8Trap;
|
|
case Intrinsic::nvvm_suld_2d_v4i16_trap:
|
|
return NVPTXISD::Suld2DV4I16Trap;
|
|
case Intrinsic::nvvm_suld_2d_v4i32_trap:
|
|
return NVPTXISD::Suld2DV4I32Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_i8_trap:
|
|
return NVPTXISD::Suld2DArrayI8Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_i16_trap:
|
|
return NVPTXISD::Suld2DArrayI16Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_i32_trap:
|
|
return NVPTXISD::Suld2DArrayI32Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
|
|
return NVPTXISD::Suld2DArrayV2I8Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
|
|
return NVPTXISD::Suld2DArrayV2I16Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
|
|
return NVPTXISD::Suld2DArrayV2I32Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
|
|
return NVPTXISD::Suld2DArrayV4I8Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
|
|
return NVPTXISD::Suld2DArrayV4I16Trap;
|
|
case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
|
|
return NVPTXISD::Suld2DArrayV4I32Trap;
|
|
case Intrinsic::nvvm_suld_3d_i8_trap:
|
|
return NVPTXISD::Suld3DI8Trap;
|
|
case Intrinsic::nvvm_suld_3d_i16_trap:
|
|
return NVPTXISD::Suld3DI16Trap;
|
|
case Intrinsic::nvvm_suld_3d_i32_trap:
|
|
return NVPTXISD::Suld3DI32Trap;
|
|
case Intrinsic::nvvm_suld_3d_v2i8_trap:
|
|
return NVPTXISD::Suld3DV2I8Trap;
|
|
case Intrinsic::nvvm_suld_3d_v2i16_trap:
|
|
return NVPTXISD::Suld3DV2I16Trap;
|
|
case Intrinsic::nvvm_suld_3d_v2i32_trap:
|
|
return NVPTXISD::Suld3DV2I32Trap;
|
|
case Intrinsic::nvvm_suld_3d_v4i8_trap:
|
|
return NVPTXISD::Suld3DV4I8Trap;
|
|
case Intrinsic::nvvm_suld_3d_v4i16_trap:
|
|
return NVPTXISD::Suld3DV4I16Trap;
|
|
case Intrinsic::nvvm_suld_3d_v4i32_trap:
|
|
return NVPTXISD::Suld3DV4I32Trap;
|
|
}
|
|
}
|
|
|
|
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
|
|
// TgtMemIntrinsic
|
|
// because we need the information that is only available in the "Value" type
|
|
// of destination
|
|
// pointer. In particular, the address space information.
|
|
bool NVPTXTargetLowering::getTgtMemIntrinsic(
|
|
IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
|
|
switch (Intrinsic) {
|
|
default:
|
|
return false;
|
|
|
|
case Intrinsic::nvvm_atomic_load_add_f32:
|
|
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
|
Info.memVT = MVT::f32;
|
|
Info.ptrVal = I.getArgOperand(0);
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = true;
|
|
Info.align = 0;
|
|
return true;
|
|
|
|
case Intrinsic::nvvm_atomic_load_inc_32:
|
|
case Intrinsic::nvvm_atomic_load_dec_32:
|
|
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
|
Info.memVT = MVT::i32;
|
|
Info.ptrVal = I.getArgOperand(0);
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = true;
|
|
Info.align = 0;
|
|
return true;
|
|
|
|
case Intrinsic::nvvm_ldu_global_i:
|
|
case Intrinsic::nvvm_ldu_global_f:
|
|
case Intrinsic::nvvm_ldu_global_p: {
|
|
|
|
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
|
if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
|
|
Info.memVT = getValueType(I.getType());
|
|
else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
|
|
Info.memVT = getPointerTy();
|
|
else
|
|
Info.memVT = getValueType(I.getType());
|
|
Info.ptrVal = I.getArgOperand(0);
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
|
|
// alignment is available as metadata.
|
|
// Grab it and set the alignment.
|
|
assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
|
|
MDNode *AlignMD = I.getMetadata("align");
|
|
assert(AlignMD && "Must have a non-null MDNode");
|
|
assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
|
|
Value *Align = AlignMD->getOperand(0);
|
|
int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
|
|
Info.align = Alignment;
|
|
|
|
return true;
|
|
}
|
|
case Intrinsic::nvvm_ldg_global_i:
|
|
case Intrinsic::nvvm_ldg_global_f:
|
|
case Intrinsic::nvvm_ldg_global_p: {
|
|
|
|
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
|
if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
|
|
Info.memVT = getValueType(I.getType());
|
|
else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
|
|
Info.memVT = getPointerTy();
|
|
else
|
|
Info.memVT = getValueType(I.getType());
|
|
Info.ptrVal = I.getArgOperand(0);
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
|
|
// alignment is available as metadata.
|
|
// Grab it and set the alignment.
|
|
assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
|
|
MDNode *AlignMD = I.getMetadata("align");
|
|
assert(AlignMD && "Must have a non-null MDNode");
|
|
assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
|
|
Value *Align = AlignMD->getOperand(0);
|
|
int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
|
|
Info.align = Alignment;
|
|
|
|
return true;
|
|
}
|
|
|
|
case Intrinsic::nvvm_tex_1d_v4f32_i32:
|
|
case Intrinsic::nvvm_tex_1d_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
|
|
case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_v4f32_i32:
|
|
case Intrinsic::nvvm_tex_2d_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
|
|
case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_3d_v4f32_i32:
|
|
case Intrinsic::nvvm_tex_3d_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
|
|
case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: {
|
|
Info.opc = getOpcForTextureInstr(Intrinsic);
|
|
Info.memVT = MVT::f32;
|
|
Info.ptrVal = nullptr;
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
Info.align = 16;
|
|
return true;
|
|
}
|
|
case Intrinsic::nvvm_tex_1d_v4i32_i32:
|
|
case Intrinsic::nvvm_tex_1d_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
|
|
case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_v4i32_i32:
|
|
case Intrinsic::nvvm_tex_2d_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
|
|
case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_3d_v4i32_i32:
|
|
case Intrinsic::nvvm_tex_3d_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
|
|
case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: {
|
|
Info.opc = getOpcForTextureInstr(Intrinsic);
|
|
Info.memVT = MVT::i32;
|
|
Info.ptrVal = nullptr;
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
Info.align = 16;
|
|
return true;
|
|
}
|
|
case Intrinsic::nvvm_suld_1d_i8_trap:
|
|
case Intrinsic::nvvm_suld_1d_v2i8_trap:
|
|
case Intrinsic::nvvm_suld_1d_v4i8_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_i8_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_v2i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_v4i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
|
|
case Intrinsic::nvvm_suld_3d_i8_trap:
|
|
case Intrinsic::nvvm_suld_3d_v2i8_trap:
|
|
case Intrinsic::nvvm_suld_3d_v4i8_trap: {
|
|
Info.opc = getOpcForSurfaceInstr(Intrinsic);
|
|
Info.memVT = MVT::i8;
|
|
Info.ptrVal = nullptr;
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
Info.align = 16;
|
|
return true;
|
|
}
|
|
case Intrinsic::nvvm_suld_1d_i16_trap:
|
|
case Intrinsic::nvvm_suld_1d_v2i16_trap:
|
|
case Intrinsic::nvvm_suld_1d_v4i16_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_i16_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_v2i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_v4i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
|
|
case Intrinsic::nvvm_suld_3d_i16_trap:
|
|
case Intrinsic::nvvm_suld_3d_v2i16_trap:
|
|
case Intrinsic::nvvm_suld_3d_v4i16_trap: {
|
|
Info.opc = getOpcForSurfaceInstr(Intrinsic);
|
|
Info.memVT = MVT::i16;
|
|
Info.ptrVal = nullptr;
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
Info.align = 16;
|
|
return true;
|
|
}
|
|
case Intrinsic::nvvm_suld_1d_i32_trap:
|
|
case Intrinsic::nvvm_suld_1d_v2i32_trap:
|
|
case Intrinsic::nvvm_suld_1d_v4i32_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_i32_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
|
|
case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_v2i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_v4i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
|
|
case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
|
|
case Intrinsic::nvvm_suld_3d_i32_trap:
|
|
case Intrinsic::nvvm_suld_3d_v2i32_trap:
|
|
case Intrinsic::nvvm_suld_3d_v4i32_trap: {
|
|
Info.opc = getOpcForSurfaceInstr(Intrinsic);
|
|
Info.memVT = MVT::i32;
|
|
Info.ptrVal = nullptr;
|
|
Info.offset = 0;
|
|
Info.vol = 0;
|
|
Info.readMem = true;
|
|
Info.writeMem = false;
|
|
Info.align = 16;
|
|
return true;
|
|
}
|
|
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/// isLegalAddressingMode - Return true if the addressing mode represented
|
|
/// by AM is legal for this target, for a load/store of the specified type.
|
|
/// Used to guide target specific optimizations, like loop strength reduction
|
|
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
|
|
/// (CodeGenPrepare.cpp)
|
|
bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
|
|
Type *Ty) const {
|
|
|
|
// AddrMode - This represents an addressing mode of:
|
|
// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
|
|
//
|
|
// The legal address modes are
|
|
// - [avar]
|
|
// - [areg]
|
|
// - [areg+immoff]
|
|
// - [immAddr]
|
|
|
|
if (AM.BaseGV) {
|
|
if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
switch (AM.Scale) {
|
|
case 0: // "r", "r+i" or "i" is allowed
|
|
break;
|
|
case 1:
|
|
if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
|
|
return false;
|
|
// Otherwise we have r+i.
|
|
break;
|
|
default:
|
|
// No scale > 1 is allowed
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// NVPTX Inline Assembly Support
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// getConstraintType - Given a constraint letter, return the type of
|
|
/// constraint it is for this target.
|
|
NVPTXTargetLowering::ConstraintType
|
|
NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
|
|
if (Constraint.size() == 1) {
|
|
switch (Constraint[0]) {
|
|
default:
|
|
break;
|
|
case 'b':
|
|
case 'r':
|
|
case 'h':
|
|
case 'c':
|
|
case 'l':
|
|
case 'f':
|
|
case 'd':
|
|
case '0':
|
|
case 'N':
|
|
return C_RegisterClass;
|
|
}
|
|
}
|
|
return TargetLowering::getConstraintType(Constraint);
|
|
}
|
|
|
|
std::pair<unsigned, const TargetRegisterClass *>
|
|
NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
|
|
MVT VT) const {
|
|
if (Constraint.size() == 1) {
|
|
switch (Constraint[0]) {
|
|
case 'b':
|
|
return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
|
|
case 'c':
|
|
return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
|
|
case 'h':
|
|
return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
|
|
case 'r':
|
|
return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
|
|
case 'l':
|
|
case 'N':
|
|
return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
|
|
case 'f':
|
|
return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
|
|
case 'd':
|
|
return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
|
|
}
|
|
}
|
|
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
|
|
}
|
|
|
|
/// getFunctionAlignment - Return the Log2 alignment of this function.
|
|
unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
|
|
return 4;
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// NVPTX DAG Combining
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Defined in another translation unit.  In this file it is read by the ADD
// combine: a value of 0 disables folding (fadd (fmul a, b), c) into an FMA.
extern unsigned FMAContractLevel;
|
|
|
|
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
|
|
/// operands N0 and N1. This is a helper for PerformADDCombine that is
|
|
/// called with the default operands, and if that fails, with commuted
|
|
/// operands.
|
|
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
|
|
TargetLowering::DAGCombinerInfo &DCI,
|
|
const NVPTXSubtarget &Subtarget,
|
|
CodeGenOpt::Level OptLevel) {
|
|
SelectionDAG &DAG = DCI.DAG;
|
|
// Skip non-integer, non-scalar case
|
|
EVT VT=N0.getValueType();
|
|
if (VT.isVector())
|
|
return SDValue();
|
|
|
|
// fold (add (mul a, b), c) -> (mad a, b, c)
|
|
//
|
|
if (N0.getOpcode() == ISD::MUL) {
|
|
assert (VT.isInteger());
|
|
// For integer:
|
|
// Since integer multiply-add costs the same as integer multiply
|
|
// but is more costly than integer add, do the fusion only when
|
|
// the mul is only used in the add.
|
|
if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
|
|
!N0.getNode()->hasOneUse())
|
|
return SDValue();
|
|
|
|
// Do the folding
|
|
return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
|
|
N0.getOperand(0), N0.getOperand(1), N1);
|
|
}
|
|
else if (N0.getOpcode() == ISD::FMUL) {
|
|
if (VT == MVT::f32 || VT == MVT::f64) {
|
|
if (FMAContractLevel == 0)
|
|
return SDValue();
|
|
|
|
// For floating point:
|
|
// Do the fusion only when the mul has less than 5 uses and all
|
|
// are add.
|
|
// The heuristic is that if a use is not an add, then that use
|
|
// cannot be fused into fma, therefore mul is still needed anyway.
|
|
// If there are more than 4 uses, even if they are all add, fusing
|
|
// them will increase register pressue.
|
|
//
|
|
int numUses = 0;
|
|
int nonAddCount = 0;
|
|
for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
|
|
UE = N0.getNode()->use_end();
|
|
UI != UE; ++UI) {
|
|
numUses++;
|
|
SDNode *User = *UI;
|
|
if (User->getOpcode() != ISD::FADD)
|
|
++nonAddCount;
|
|
}
|
|
if (numUses >= 5)
|
|
return SDValue();
|
|
if (nonAddCount) {
|
|
int orderNo = N->getIROrder();
|
|
int orderNo2 = N0.getNode()->getIROrder();
|
|
// simple heuristics here for considering potential register
|
|
// pressure, the logics here is that the differnce are used
|
|
// to measure the distance between def and use, the longer distance
|
|
// more likely cause register pressure.
|
|
if (orderNo - orderNo2 < 500)
|
|
return SDValue();
|
|
|
|
// Now, check if at least one of the FMUL's operands is live beyond the node N,
|
|
// which guarantees that the FMA will not increase register pressure at node N.
|
|
bool opIsLive = false;
|
|
const SDNode *left = N0.getOperand(0).getNode();
|
|
const SDNode *right = N0.getOperand(1).getNode();
|
|
|
|
if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
|
|
opIsLive = true;
|
|
|
|
if (!opIsLive)
|
|
for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
|
|
SDNode *User = *UI;
|
|
int orderNo3 = User->getIROrder();
|
|
if (orderNo3 > orderNo) {
|
|
opIsLive = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!opIsLive)
|
|
for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
|
|
SDNode *User = *UI;
|
|
int orderNo3 = User->getIROrder();
|
|
if (orderNo3 > orderNo) {
|
|
opIsLive = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!opIsLive)
|
|
return SDValue();
|
|
}
|
|
|
|
return DAG.getNode(ISD::FMA, SDLoc(N), VT,
|
|
N0.getOperand(0), N0.getOperand(1), N1);
|
|
}
|
|
}
|
|
|
|
return SDValue();
|
|
}
|
|
|
|
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
|
|
///
|
|
static SDValue PerformADDCombine(SDNode *N,
|
|
TargetLowering::DAGCombinerInfo &DCI,
|
|
const NVPTXSubtarget &Subtarget,
|
|
CodeGenOpt::Level OptLevel) {
|
|
SDValue N0 = N->getOperand(0);
|
|
SDValue N1 = N->getOperand(1);
|
|
|
|
// First try with the default operand order.
|
|
SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
|
|
OptLevel);
|
|
if (Result.getNode())
|
|
return Result;
|
|
|
|
// If that didn't work, try again with the operands commuted.
|
|
return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
|
|
}
|
|
|
|
/// PerformANDCombine - Remove a redundant (and x, 0xff) that follows a
/// target-specific i8 vector load.
///
/// The type legalizer turns a vector load of i8 values into a zextload to i16
/// registers, optionally ANY_EXTENDs it (if target type is integer), and ANDs
/// off the high 8 bits.  Because the load has been turned into a
/// target-specific DAG node, the generic DAG combiner does not eliminate the
/// AND, so it is done here.
///
/// The replacement is performed through DCI.CombineTo; the function itself
/// always returns an empty SDValue.
static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  SDValue MaskOp = N->getOperand(1);

  // Put the constant (if any) on the mask side.
  if (isa<ConstantSDNode>(Src))
    std::swap(Src, MaskOp);

  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and.
  // Peel the ANY_EXTEND, remembering it so it can be rebuilt as a zext.
  SDValue AnyExt;
  if (Src.getOpcode() == ISD::ANY_EXTEND) {
    AnyExt = Src;
    Src = Src->getOperand(0);
  }

  // Peel the register move.
  if (Src->isMachineOpcode() && Src->getMachineOpcode() == NVPTX::IMOV16rr)
    Src = Src->getOperand(0);

  // Only the target-specific vector loads are of interest.
  if (Src->getOpcode() != NVPTXISD::LoadV2 &&
      Src->getOpcode() != NVPTXISD::LoadV4)
    return SDValue();

  // Not an AND with a constant.
  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskNode)
    return SDValue();

  // Not an AND that chops off the top 8 bits.
  if (MaskNode->getZExtValue() != 0xff)
    return SDValue();

  // Not a MemSDNode?!?
  MemSDNode *MemNode = dyn_cast<MemSDNode>(Src);
  if (!MemNode)
    return SDValue();

  // We only handle the i8 case.
  EVT LoadedVT = MemNode->getMemoryVT();
  if (LoadedVT != MVT::v2i8 && LoadedVT != MVT::v4i8)
    return SDValue();

  // The extension kind is carried as the load node's last operand.  If for
  // some reason the load is a sextload, the AND is needed to zero out the
  // high 8 bits.
  unsigned ExtKind =
      cast<ConstantSDNode>(Src->getOperand(Src->getNumOperands() - 1))
          ->getZExtValue();
  if (ExtKind == ISD::SEXTLOAD)
    return SDValue();

  // Re-insert the ext as a zext, if there was one.
  bool AddToWorklist = false;
  if (AnyExt.getNode()) {
    Src = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), AnyExt.getValueType(),
                          Src);
    AddToWorklist = true;
  }

  // If we get here, the AND is unnecessary.  Just replace it with the load.
  DCI.CombineTo(N, Src, AddToWorklist);
  return SDValue();
}
|
|
|
|
/// Signedness of a candidate mul.wide operand, as reported by
/// IsMulWideOperandDemotable.
enum OperandSignedness {
  Signed,   ///< Operand is a sign extension (value 0).
  Unsigned, ///< Operand is a zero extension.
  Unknown   ///< Signedness could not be determined.
};
|
|
|
|
/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
|
|
/// that can be demoted to \p OptSize bits without loss of information. The
|
|
/// signedness of the operand, if determinable, is placed in \p S.
|
|
static bool IsMulWideOperandDemotable(SDValue Op,
|
|
unsigned OptSize,
|
|
OperandSignedness &S) {
|
|
S = Unknown;
|
|
|
|
if (Op.getOpcode() == ISD::SIGN_EXTEND ||
|
|
Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
|
|
EVT OrigVT = Op.getOperand(0).getValueType();
|
|
if (OrigVT.getSizeInBits() == OptSize) {
|
|
S = Signed;
|
|
return true;
|
|
}
|
|
} else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
|
|
EVT OrigVT = Op.getOperand(0).getValueType();
|
|
if (OrigVT.getSizeInBits() == OptSize) {
|
|
S = Unsigned;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
|
|
/// be demoted to \p OptSize bits without loss of information. If the operands
|
|
/// contain a constant, it should appear as the RHS operand. The signedness of
|
|
/// the operands is placed in \p IsSigned.
|
|
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
|
|
unsigned OptSize,
|
|
bool &IsSigned) {
|
|
|
|
OperandSignedness LHSSign;
|
|
|
|
// The LHS operand must be a demotable op
|
|
if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
|
|
return false;
|
|
|
|
// We should have been able to determine the signedness from the LHS
|
|
if (LHSSign == Unknown)
|
|
return false;
|
|
|
|
IsSigned = (LHSSign == Signed);
|
|
|
|
// The RHS can be a demotable op or a constant
|
|
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
|
|
APInt Val = CI->getAPIntValue();
|
|
if (LHSSign == Unsigned) {
|
|
if (Val.isIntN(OptSize)) {
|
|
return true;
|
|
}
|
|
return false;
|
|
} else {
|
|
if (Val.isSignedIntN(OptSize)) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
} else {
|
|
OperandSignedness RHSSign;
|
|
if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
|
|
return false;
|
|
|
|
if (LHSSign != RHSSign)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
}
|
|
|
|
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
|
|
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
|
|
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
|
|
/// amount.
|
|
static SDValue TryMULWIDECombine(SDNode *N,
|
|
TargetLowering::DAGCombinerInfo &DCI) {
|
|
EVT MulType = N->getValueType(0);
|
|
if (MulType != MVT::i32 && MulType != MVT::i64) {
|
|
return SDValue();
|
|
}
|
|
|
|
unsigned OptSize = MulType.getSizeInBits() >> 1;
|
|
SDValue LHS = N->getOperand(0);
|
|
SDValue RHS = N->getOperand(1);
|
|
|
|
// Canonicalize the multiply so the constant (if any) is on the right
|
|
if (N->getOpcode() == ISD::MUL) {
|
|
if (isa<ConstantSDNode>(LHS)) {
|
|
std::swap(LHS, RHS);
|
|
}
|
|
}
|
|
|
|
// If we have a SHL, determine the actual multiply amount
|
|
if (N->getOpcode() == ISD::SHL) {
|
|
ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
|
|
if (!ShlRHS) {
|
|
return SDValue();
|
|
}
|
|
|
|
APInt ShiftAmt = ShlRHS->getAPIntValue();
|
|
unsigned BitWidth = MulType.getSizeInBits();
|
|
if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
|
|
APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
|
|
RHS = DCI.DAG.getConstant(MulVal, MulType);
|
|
} else {
|
|
return SDValue();
|
|
}
|
|
}
|
|
|
|
bool Signed;
|
|
// Verify that our operands are demotable
|
|
if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
|
|
return SDValue();
|
|
}
|
|
|
|
EVT DemotedVT;
|
|
if (MulType == MVT::i32) {
|
|
DemotedVT = MVT::i16;
|
|
} else {
|
|
DemotedVT = MVT::i32;
|
|
}
|
|
|
|
// Truncate the operands to the correct size. Note that these are just for
|
|
// type consistency and will (likely) be eliminated in later phases.
|
|
SDValue TruncLHS =
|
|
DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
|
|
SDValue TruncRHS =
|
|
DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
|
|
|
|
unsigned Opc;
|
|
if (Signed) {
|
|
Opc = NVPTXISD::MUL_WIDE_SIGNED;
|
|
} else {
|
|
Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
|
|
}
|
|
|
|
return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
|
|
}
|
|
|
|
/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
|
|
static SDValue PerformMULCombine(SDNode *N,
|
|
TargetLowering::DAGCombinerInfo &DCI,
|
|
CodeGenOpt::Level OptLevel) {
|
|
if (OptLevel > 0) {
|
|
// Try mul.wide combining at OptLevel > 0
|
|
SDValue Ret = TryMULWIDECombine(N, DCI);
|
|
if (Ret.getNode())
|
|
return Ret;
|
|
}
|
|
|
|
return SDValue();
|
|
}
|
|
|
|
/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
|
|
static SDValue PerformSHLCombine(SDNode *N,
|
|
TargetLowering::DAGCombinerInfo &DCI,
|
|
CodeGenOpt::Level OptLevel) {
|
|
if (OptLevel > 0) {
|
|
// Try mul.wide combining at OptLevel > 0
|
|
SDValue Ret = TryMULWIDECombine(N, DCI);
|
|
if (Ret.getNode())
|
|
return Ret;
|
|
}
|
|
|
|
return SDValue();
|
|
}
|
|
|
|
/// PerformDAGCombine - Dispatch to the NVPTX-specific combine for the opcode
/// of \p N.  ADD/FADD, MUL, SHL and AND are handled; every other opcode
/// yields an empty SDValue.
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  // FIXME: Get this from the DAG somehow
  CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;

  const unsigned Opc = N->getOpcode();
  if (Opc == ISD::ADD || Opc == ISD::FADD)
    return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
  if (Opc == ISD::MUL)
    return PerformMULCombine(N, DCI, OptLevel);
  if (Opc == ISD::SHL)
    return PerformSHLCombine(N, DCI, OptLevel);
  if (Opc == ISD::AND)
    return PerformANDCombine(N, DCI);

  return SDValue();
}
|
|
|
|
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
|
|
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
|
SmallVectorImpl<SDValue> &Results) {
|
|
EVT ResVT = N->getValueType(0);
|
|
SDLoc DL(N);
|
|
|
|
assert(ResVT.isVector() && "Vector load must have vector type");
|
|
|
|
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
|
|
// legal. We can (and should) split that into 2 loads of <2 x double> here
|
|
// but I'm leaving that as a TODO for now.
|
|
assert(ResVT.isSimple() && "Can only handle simple types");
|
|
switch (ResVT.getSimpleVT().SimpleTy) {
|
|
default:
|
|
return;
|
|
case MVT::v2i8:
|
|
case MVT::v2i16:
|
|
case MVT::v2i32:
|
|
case MVT::v2i64:
|
|
case MVT::v2f32:
|
|
case MVT::v2f64:
|
|
case MVT::v4i8:
|
|
case MVT::v4i16:
|
|
case MVT::v4i32:
|
|
case MVT::v4f32:
|
|
// This is a "native" vector type
|
|
break;
|
|
}
|
|
|
|
EVT EltVT = ResVT.getVectorElementType();
|
|
unsigned NumElts = ResVT.getVectorNumElements();
|
|
|
|
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
|
|
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
|
|
// loaded type to i16 and propagate the "real" type as the memory type.
|
|
bool NeedTrunc = false;
|
|
if (EltVT.getSizeInBits() < 16) {
|
|
EltVT = MVT::i16;
|
|
NeedTrunc = true;
|
|
}
|
|
|
|
unsigned Opcode = 0;
|
|
SDVTList LdResVTs;
|
|
|
|
switch (NumElts) {
|
|
default:
|
|
return;
|
|
case 2:
|
|
Opcode = NVPTXISD::LoadV2;
|
|
LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
|
|
break;
|
|
case 4: {
|
|
Opcode = NVPTXISD::LoadV4;
|
|
EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
|
|
LdResVTs = DAG.getVTList(ListVTs);
|
|
break;
|
|
}
|
|
}
|
|
|
|
SmallVector<SDValue, 8> OtherOps;
|
|
|
|
// Copy regular operands
|
|
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
|
|
OtherOps.push_back(N->getOperand(i));
|
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(N);
|
|
|
|
// The select routine does not have access to the LoadSDNode instance, so
|
|
// pass along the extension information
|
|
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
|
|
|
|
SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
|
|
LD->getMemoryVT(),
|
|
LD->getMemOperand());
|
|
|
|
SmallVector<SDValue, 4> ScalarRes;
|
|
|
|
for (unsigned i = 0; i < NumElts; ++i) {
|
|
SDValue Res = NewLD.getValue(i);
|
|
if (NeedTrunc)
|
|
Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
|
|
ScalarRes.push_back(Res);
|
|
}
|
|
|
|
SDValue LoadChain = NewLD.getValue(NumElts);
|
|
|
|
SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
|
|
|
|
Results.push_back(BuildVec);
|
|
Results.push_back(LoadChain);
|
|
}
|
|
|
|
/// ReplaceINTRINSIC_W_CHAIN - Custom result replacement for the
/// nvvm_ldg_global_{i,f,p} and nvvm_ldu_global_{i,f,p} intrinsics.
///
///  - Vector result with 2 or 4 elements: a NVPTXISD::LDGV2/LDGV4 or
///    LDUV2/LDUV4 memory intrinsic node is built with one scalar result per
///    element, and a BUILD_VECTOR plus the new chain are appended to
///    \p Results.
///  - Scalar result (asserted to be i8): the intrinsic is re-emitted with an
///    i16 result and an i8 memory type, and the truncated value plus the new
///    chain are appended to \p Results.
///
/// Any other intrinsic, or a vector with another element count, leaves
/// \p Results untouched.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID and classify it as LDG or LDU.
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  bool IsLDG;
  switch (IntrinNo) {
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
    IsLDG = true;
    break;
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p:
    IsLDG = false;
    break;
  default:
    return;
  }

  EVT ResVT = N->getValueType(0);

  if (!ResVT.isVector()) {
    // i8 LDG/LDU
    assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
           "Custom handling of non-i8 ldu/ldg?");

    // Just copy all operands as-is
    SmallVector<SDValue, 4> Ops;
    for (unsigned OpIdx = 0, NumOps = N->getNumOperands(); OpIdx != NumOps;
         ++OpIdx)
      Ops.push_back(N->getOperand(OpIdx));

    // Force output to i16
    SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

    MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

    // We make sure the memory type is i8, which will be used during isel
    // to select the proper instruction.
    SDValue NewLD =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                MVT::i8, MemSD->getMemOperand());

    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                  NewLD.getValue(0)));
    Results.push_back(NewLD.getValue(1));
    return;
  }

  // Vector LDG/LDU

  const unsigned Count = ResVT.getVectorNumElements();

  // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  EVT ScalarVT = ResVT.getVectorElementType();
  const bool NeedTrunc = ScalarVT.getSizeInBits() < 16;
  if (NeedTrunc)
    ScalarVT = MVT::i16;

  // Pick the target opcode and result list from the element count.
  unsigned Opcode = 0;
  SDVTList LdResVTs;
  if (Count == 2) {
    Opcode = IsLDG ? NVPTXISD::LDGV2 : NVPTXISD::LDUV2;
    LdResVTs = DAG.getVTList(ScalarVT, ScalarVT, MVT::Other);
  } else if (Count == 4) {
    Opcode = IsLDG ? NVPTXISD::LDGV4 : NVPTXISD::LDUV4;
    EVT ListVTs[] = { ScalarVT, ScalarVT, ScalarVT, ScalarVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
  } else {
    return;
  }

  // Operands of the new node: the chain, then everything after the
  // intrinsic ID (operand 1 is skipped).
  SmallVector<SDValue, 8> OtherOps;
  OtherOps.push_back(Chain);
  for (unsigned OpIdx = 2, NumOps = N->getNumOperands(); OpIdx != NumOps;
       ++OpIdx)
    OtherOps.push_back(N->getOperand(OpIdx));

  MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          MemSD->getMemoryVT(),
                                          MemSD->getMemOperand());

  // Gather the scalar results, narrowing back to the original element type
  // when the load was widened to i16.
  SmallVector<SDValue, 4> ScalarRes;
  for (unsigned Elt = 0; Elt < Count; ++Elt) {
    SDValue Piece = NewLD.getValue(Elt);
    if (NeedTrunc)
      Piece = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(),
                          Piece);
    ScalarRes.push_back(Piece);
  }

  // The chain is the result that follows the scalar elements.
  SDValue LoadChain = NewLD.getValue(Count);

  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}
|
|
|
|
/// ReplaceNodeResults - Custom result replacement entry point.  ISD::LOAD and
/// ISD::INTRINSIC_W_CHAIN are forwarded to their helpers, which append the
/// replacement values to \p Results; any other opcode is a fatal error.
void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  const unsigned Opc = N->getOpcode();

  if (Opc == ISD::LOAD) {
    ReplaceLoadVector(N, DAG, Results);
    return;
  }

  if (Opc == ISD::INTRINSIC_W_CHAIN) {
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }

  report_fatal_error("Unhandled custom legalization");
}
|
|
|
|
// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
|
|
// Out-of-line definition with an intentionally empty body; it exists only so
// that NVPTXSection has a member function defined in this file.
void NVPTXSection::anchor() {}
|
|
|
|
// Destructor: deletes every section object referenced by this object's
// section members.  Defining it here also pins the class's vtable to this
// file.
// NOTE(review): where these sections are allocated is not visible from this
// file -- presumably in this class's initialization; confirm before changing
// ownership.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
|
|
// Basic text/data sections.
delete TextSection;
|
|
delete DataSection;
|
|
delete BSSSection;
|
|
delete ReadOnlySection;
|
|
|
|
// Static constructor/destructor, exception-handling and DWARF sections.
delete StaticCtorSection;
|
|
delete StaticDtorSection;
|
|
delete LSDASection;
|
|
delete EHFrameSection;
|
|
delete DwarfAbbrevSection;
|
|
delete DwarfInfoSection;
|
|
delete DwarfLineSection;
|
|
delete DwarfFrameSection;
|
|
delete DwarfPubTypesSection;
|
|
delete DwarfDebugInlineSection;
|
|
delete DwarfStrSection;
|
|
delete DwarfLocSection;
|
|
delete DwarfARangesSection;
|
|
delete DwarfRangesSection;
|
|
delete DwarfMacroInfoSection;
|
|
}
|