[NVPTX] Implement fma and imad contraction as target DAGCombiner patterns
This also introduces DAGCombiner patterns for mul.wide, which multiplies two smaller integers to produce a larger integer.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211935 91177308-0d34-0410-b5e6-96231b3b80d8
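As an illustration of the fma side of the patch (the new tests in this diff cover only imad and mul.wide), a minimal sketch of the kind of input the combiner targets; the function name and CHECK line are hypothetical, assuming the default contraction level of 2:

; Hypothetical sketch, not part of this commit: with the default
; -nvptx-fma-level=2, an fmul whose only use is an fadd should be
; contracted to a single fused multiply-add.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
define float @fma_contract(float %a, float %b, float %c) {
; CHECK: fma.rn.f32
  %mul = fmul float %a, %b
  %add = fadd float %mul, %c
  ret float %add
}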
@@ -24,10 +24,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "nvptx-isel"
 
-static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
-                 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
-                          " 1: do it  2: do it aggressively"),
-                 cl::init(2));
+unsigned FMAContractLevel = 0;
+
+static cl::opt<unsigned, true>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                             " 1: do it  2: do it aggressively"),
+                    cl::location(FMAContractLevel),
+                    cl::init(2));
 
 static cl::opt<int> UsePrecDivF32(
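The option keeps its old spelling (it is now merely backed by an external global so the DAG combiner can read it); a hypothetical RUN line that turns contraction off:

; Hypothetical usage, assuming a FileCheck test: the hidden flag still
; accepts the old spelling on the llc command line.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s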
@@ -243,6 +243,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   setOperationAction(ISD::CTPOP, MVT::i32, Legal);
   setOperationAction(ISD::CTPOP, MVT::i64, Legal);
 
+  // We have some custom DAG combine patterns for these nodes
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SHL);
+
   // Now deduce the information based on the above mentioned
   // actions
   computeRegisterProperties();
@@ -334,6 +341,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::StoreV2";
   case NVPTXISD::StoreV4:
     return "NVPTXISD::StoreV4";
+  case NVPTXISD::FUN_SHFL_CLAMP:
+    return "NVPTXISD::FUN_SHFL_CLAMP";
+  case NVPTXISD::FUN_SHFR_CLAMP:
+    return "NVPTXISD::FUN_SHFR_CLAMP";
   case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32";
   case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
   case NVPTXISD::Tex1DFloatFloatLevel:
@@ -2475,6 +2486,406 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
   return 4;
 }
 
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+extern unsigned FMAContractLevel;
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                             TargetLowering::DAGCombinerInfo &DCI,
+                                             const NVPTXSubtarget &Subtarget,
+                                             CodeGenOpt::Level OptLevel) {
+  SelectionDAG &DAG = DCI.DAG;
+  // Skip the non-integer, non-scalar case.
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  // fold (add (mul a, b), c) -> (mad a, b, c)
+  //
+  if (N0.getOpcode() == ISD::MUL) {
+    assert(VT.isInteger());
+    // For integer:
+    // Since integer multiply-add costs the same as integer multiply
+    // but is more costly than integer add, do the fusion only when
+    // the mul is only used in the add.
+    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
+        !N0.getNode()->hasOneUse())
+      return SDValue();
+
+    // Do the folding.
+    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  }
+  else if (N0.getOpcode() == ISD::FMUL) {
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      if (FMAContractLevel == 0)
+        return SDValue();
+
+      // For floating point:
+      // Do the fusion only when the mul has fewer than 5 uses and all
+      // of them are adds.
+      // The heuristic is that if a use is not an add, then that use
+      // cannot be fused into an fma, so the mul is still needed anyway.
+      // If there are more than 4 uses, even if they are all adds, fusing
+      // them will increase register pressure.
+      //
+      int numUses = 0;
+      int nonAddCount = 0;
+      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+                                UE = N0.getNode()->use_end();
+           UI != UE; ++UI) {
+        numUses++;
+        SDNode *User = *UI;
+        if (User->getOpcode() != ISD::FADD)
+          ++nonAddCount;
+      }
+      if (numUses >= 5)
+        return SDValue();
+      if (nonAddCount) {
+        int orderNo = N->getIROrder();
+        int orderNo2 = N0.getNode()->getIROrder();
+        // Simple heuristic for estimating register pressure: the IR-order
+        // difference approximates the distance between the def and this use,
+        // and the longer that distance, the more likely the fusion is to
+        // increase register pressure.
+        if (orderNo - orderNo2 < 500)
+          return SDValue();
+
+        // Now, check if at least one of the FMUL's operands is live beyond
+        // the node N, which guarantees that the FMA will not increase
+        // register pressure at node N.
+        bool opIsLive = false;
+        const SDNode *left = N0.getOperand(0).getNode();
+        const SDNode *right = N0.getOperand(1).getNode();
+
+        if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
+          opIsLive = true;
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = left->use_begin(),
+                                    UE = left->use_end();
+               UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = right->use_begin(),
+                                    UE = right->use_end();
+               UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          return SDValue();
+      }
+
+      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1), N1);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const NVPTXSubtarget &Subtarget,
+                                 CodeGenOpt::Level OptLevel) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
+                                                 OptLevel);
+  if (Result.getNode())
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // The type legalizer turns a vector load of i8 values into a zextload to i16
+  // registers, optionally ANY_EXTENDs it (if target type is integer),
+  // and ANDs off the high 8 bits. Since we turn this load into a
+  // target-specific DAG node, the DAG combiner fails to eliminate these AND
+  // nodes. Do that here.
+  SDValue Val = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+
+  if (isa<ConstantSDNode>(Val)) {
+    std::swap(Val, Mask);
+  }
+
+  SDValue AExt;
+  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+  if (Val.getOpcode() == ISD::ANY_EXTEND) {
+    AExt = Val;
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+      Val->getOpcode() == NVPTXISD::LoadV4) {
+    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+    if (!MaskCnst) {
+      // Not an AND with a constant
+      return SDValue();
+    }
+
+    uint64_t MaskVal = MaskCnst->getZExtValue();
+    if (MaskVal != 0xff) {
+      // Not an AND that chops off the top 8 bits
+      return SDValue();
+    }
+
+    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+    if (!Mem) {
+      // Not a MemSDNode?!?
+      return SDValue();
+    }
+
+    EVT MemVT = Mem->getMemoryVT();
+    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+      // We only handle the i8 case
+      return SDValue();
+    }
+
+    unsigned ExtType =
+      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+        getZExtValue();
+    if (ExtType == ISD::SEXTLOAD) {
+      // If for some reason the load is a sextload, the and is needed to zero
+      // out the high 8 bits.
+      return SDValue();
+    }
+
+    bool AddTo = false;
+    if (AExt.getNode() != 0) {
+      // Re-insert the ext as a zext.
+      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+                            AExt.getValueType(), Val);
+      AddTo = true;
+    }
+
+    // If we get here, the AND is unnecessary.  Just replace it with the load.
+    DCI.CombineTo(N, Val, AddTo);
+  }
+
+  return SDValue();
+}
+
+enum OperandSignedness {
+  Signed = 0,
+  Unsigned,
+  Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+                                      unsigned OptSize,
+                                      OperandSignedness &S) {
+  S = Unknown;
+
+  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() == OptSize) {
+      S = Signed;
+      return true;
+    }
+  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() == OptSize) {
+      S = Unsigned;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                        unsigned OptSize,
+                                        bool &IsSigned) {
+
+  OperandSignedness LHSSign;
+
+  // The LHS operand must be a demotable op
+  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+    return false;
+
+  // We should have been able to determine the signedness from the LHS
+  if (LHSSign == Unknown)
+    return false;
+
+  IsSigned = (LHSSign == Signed);
+
+  // The RHS can be a demotable op or a constant
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+    APInt Val = CI->getAPIntValue();
+    if (LHSSign == Unsigned) {
+      if (Val.isIntN(OptSize)) {
+        return true;
+      }
+      return false;
+    } else {
+      if (Val.isSignedIntN(OptSize)) {
+        return true;
+      }
+      return false;
+    }
+  } else {
+    OperandSignedness RHSSign;
+    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+      return false;
+
+    if (LHSSign != RHSSign)
+      return false;
+
+    return true;
+  }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT MulType = N->getValueType(0);
+  if (MulType != MVT::i32 && MulType != MVT::i64) {
+    return SDValue();
+  }
+
+  unsigned OptSize = MulType.getSizeInBits() >> 1;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Canonicalize the multiply so the constant (if any) is on the right
+  if (N->getOpcode() == ISD::MUL) {
+    if (isa<ConstantSDNode>(LHS)) {
+      std::swap(LHS, RHS);
+    }
+  }
+
+  // If we have a SHL, determine the actual multiply amount
+  if (N->getOpcode() == ISD::SHL) {
+    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (!ShlRHS) {
+      return SDValue();
+    }
+
+    APInt ShiftAmt = ShlRHS->getAPIntValue();
+    unsigned BitWidth = MulType.getSizeInBits();
+    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+      RHS = DCI.DAG.getConstant(MulVal, MulType);
+    } else {
+      return SDValue();
+    }
+  }
+
+  bool Signed;
+  // Verify that our operands are demotable
+  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+    return SDValue();
+  }
+
+  EVT DemotedVT;
+  if (MulType == MVT::i32) {
+    DemotedVT = MVT::i16;
+  } else {
+    DemotedVT = MVT::i32;
+  }
+
+  // Truncate the operands to the correct size. Note that these are just for
+  // type consistency and will (likely) be eliminated in later phases.
+  SDValue TruncLHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
+  SDValue TruncRHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
+
+  unsigned Opc;
+  if (Signed) {
+    Opc = NVPTXISD::MUL_WIDE_SIGNED;
+  } else {
+    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+  }
+
+  return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
+}
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    SDValue Ret = TryMULWIDECombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    SDValue Ret = TryMULWIDECombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  // FIXME: Get this from the DAG somehow
+  CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
+  switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::FADD:
+      return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+    case ISD::MUL:
+      return PerformMULCombine(N, DCI, OptLevel);
+    case ISD::SHL:
+      return PerformSHLCombine(N, DCI, OptLevel);
+    case ISD::AND:
+      return PerformANDCombine(N, DCI);
+  }
+  return SDValue();
+}
+
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &Results) {
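TryMULWIDECombine above also fires on shl-by-constant, a case the new tests at the bottom of this diff do not cover; a hypothetical sketch:

; Hypothetical sketch, not part of this commit: a left shift of a
; sign-extended i16 is treated as a multiply by (1 << 4) = 16, which fits
; in 16 signed bits, so it should demote to mul.wide.s16.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
define i32 @mulwide_shl(i16 %a) {
; CHECK: mul.wide.s16
  %ext = sext i16 %a to i32
  %shl = shl i32 %ext, 4
  ret i32 %shl
}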
@@ -49,6 +49,9 @@ enum NodeType {
   CallSeqBegin,
   CallSeqEnd,
   CallPrototype,
+  MUL_WIDE_SIGNED,
+  MUL_WIDE_UNSIGNED,
+  IMAD,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -258,6 +261,7 @@ private:
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
   unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
                                 Type *Ty, unsigned Idx) const;
@@ -464,32 +464,44 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
 }]>;
 
-def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
-                           (ins Int32Regs:$a, Int32Regs:$b),
-                           "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
-                              (ins Int32Regs:$a, i64imm:$b),
-                              "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+              "mul.wide.s32 \t$dst, $a, $b;", []>;
 
-def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
-                           (ins Int32Regs:$a, Int32Regs:$b),
-                           "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
-                              (ins Int32Regs:$a, i64imm:$b),
-                              "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+              "mul.wide.u32 \t$dst, $a, $b;", []>;
 
-def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
-                           (ins Int16Regs:$a, Int16Regs:$b),
-                           "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
-                              (ins Int16Regs:$a, i32imm:$b),
-                              "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+              "mul.wide.s16 \t$dst, $a, $b;", []>;
 
-def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
-                           (ins Int16Regs:$a, Int16Regs:$b),
-                           "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
-                              (ins Int16Regs:$a, i32imm:$b),
-                              "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+              "mul.wide.u16 \t$dst, $a, $b;", []>;
 
 def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
@@ -510,25 +522,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
           Requires<[doMulWide]>;
 def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
-          (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>,
+          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
           Requires<[doMulWide]>;
 
 def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
-          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>;
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
 def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>,
+          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
           Requires<[doMulWide]>;
 
 def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
-          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+          Requires<[doMulWide]>;
 def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
-          (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>,
+          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
           Requires<[doMulWide]>;
 
 def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
-          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          Requires<[doMulWide]>;
 def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>,
+          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
           Requires<[doMulWide]>;
 
+
+def SDTMulWide
+  : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed
+  : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned
+  : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
 
 defm MULT : I3<"mul.lo.s", mul>;
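The renamed *Imm64 forms keep the sext/zext-times-constant patterns working when the constant arrives as an i64, while the plain *Imm forms take the narrower immediates produced by the new mul_wide_* nodes; a hypothetical sketch of the i64-immediate case:

; Hypothetical sketch, not part of this commit: the i64 immediate fits in
; 32 signed bits (SInt32Const), so the multiply should narrow to
; mul.wide.s32 with an immediate operand.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
define i64 @mulwide_imm(i32 %a) {
; CHECK: mul.wide.s32
  %ext = sext i32 %a to i64
  %mul = mul i64 %ext, 12345
  ret i64 %mul
}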
@@ -544,69 +594,75 @@ defm SREM : I3<"rem.s", srem>;
 defm UREM : I3<"rem.u", urem>;
 // The ri version will not be selected as DAGCombiner::visitUREM will lower it.
 
+def SDTIMAD
+  : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+                         SDTCisInt<2>, SDTCisSameAs<0, 2>,
+                         SDTCisSameAs<0, 3>]>;
+def imad
+  : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
 def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
                          (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
                          "mad.lo.s16 \t$dst, $a, $b, $c;",
-                         [(set Int16Regs:$dst, (add
-                           (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>;
+                         [(set Int16Regs:$dst,
+                           (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
 def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
                          (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
                          "mad.lo.s16 \t$dst, $a, $b, $c;",
-                         [(set Int16Regs:$dst, (add
-                           (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>;
+                         [(set Int16Regs:$dst,
+                           (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
 def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
                          (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
                          "mad.lo.s16 \t$dst, $a, $b, $c;",
-                         [(set Int16Regs:$dst, (add
-                           (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>;
+                         [(set Int16Regs:$dst,
+                           (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
 def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
                          (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
                          "mad.lo.s16 \t$dst, $a, $b, $c;",
-                         [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b),
-                           imm:$c))]>;
+                         [(set Int16Regs:$dst,
+                           (imad Int16Regs:$a, imm:$b, imm:$c))]>;
 
 def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
                          "mad.lo.s32 \t$dst, $a, $b, $c;",
-                         [(set Int32Regs:$dst, (add
-                           (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>;
+                         [(set Int32Regs:$dst,
+                           (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
 def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
                          "mad.lo.s32 \t$dst, $a, $b, $c;",
-                         [(set Int32Regs:$dst, (add
-                           (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>;
+                         [(set Int32Regs:$dst,
+                           (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
 def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
                          "mad.lo.s32 \t$dst, $a, $b, $c;",
-                         [(set Int32Regs:$dst, (add
-                           (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>;
+                         [(set Int32Regs:$dst,
+                           (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
 def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
                          "mad.lo.s32 \t$dst, $a, $b, $c;",
-                         [(set Int32Regs:$dst, (add
-                           (mul Int32Regs:$a, imm:$b), imm:$c))]>;
+                         [(set Int32Regs:$dst,
+                           (imad Int32Regs:$a, imm:$b, imm:$c))]>;
 
 def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
                          "mad.lo.s64 \t$dst, $a, $b, $c;",
-                         [(set Int64Regs:$dst, (add
-                           (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>;
+                         [(set Int64Regs:$dst,
+                           (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
 def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
                          "mad.lo.s64 \t$dst, $a, $b, $c;",
-                         [(set Int64Regs:$dst, (add
-                           (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>;
+                         [(set Int64Regs:$dst,
+                           (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
 def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
                          "mad.lo.s64 \t$dst, $a, $b, $c;",
-                         [(set Int64Regs:$dst, (add
-                           (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>;
+                         [(set Int64Regs:$dst,
+                           (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
 def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
                          "mad.lo.s64 \t$dst, $a, $b, $c;",
-                         [(set Int64Regs:$dst, (add
-                           (mul Int64Regs:$a, imm:$b), imm:$c))]>;
+                         [(set Int64Regs:$dst,
+                           (imad Int64Regs:$a, imm:$b, imm:$c))]>;
 
 
 def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                        "neg.s16 \t$dst, $src;",
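The rri/rir/rii forms now key off the same imad node as rrr; a hypothetical variant of the new imad.ll test below that would exercise MAD32rri:

; Hypothetical sketch, not part of this commit: an immediate addend keeps
; the mul's only use in the add, so the IMAD combine still applies and the
; immediate form of mad.lo.s32 is selected.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
define i32 @imad_imm(i32 %a, i32 %b) {
; CHECK: mad.lo.s32
  %val0 = mul i32 %a, %b
  %val1 = add i32 %val0, 42
  ret i32 %val1
}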
@@ -812,36 +868,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
   def rrr : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float32Regs:$dst, (fadd
-                        (fmul Float32Regs:$a, Float32Regs:$b),
-                        Float32Regs:$c))]>, Requires<[Pred]>;
-  // This is to WAR a weird bug in Tablegen that does not automatically
-  // generate the following permutated rule rrr2 from the above rrr.
-  // So we explicitly add it here. This happens to FMA32 only.
-  // See the comments at FMAD32 and FMA32 for more information.
-  def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
-                       (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
-                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                       [(set Float32Regs:$dst, (fadd Float32Regs:$c,
-                         (fmul Float32Regs:$a, Float32Regs:$b)))]>,
-                       Requires<[Pred]>;
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
+                      Requires<[Pred]>;
   def rri : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float32Regs:$dst, (fadd
-                        (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
   def rir : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float32Regs:$dst, (fadd
-                        (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
                       Requires<[Pred]>;
   def rii : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float32Regs:$dst, (fadd
-                        (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
 }
 
@@ -849,73 +895,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
   def rrr : NVPTXInst<(outs Float64Regs:$dst),
                       (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float64Regs:$dst, (fadd
-                        (fmul Float64Regs:$a, Float64Regs:$b),
-                        Float64Regs:$c))]>, Requires<[Pred]>;
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
+                      Requires<[Pred]>;
   def rri : NVPTXInst<(outs Float64Regs:$dst),
                       (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
-                        Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
   def rir : NVPTXInst<(outs Float64Regs:$dst),
                       (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float64Regs:$dst, (fadd
-                        (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
                       Requires<[Pred]>;
   def rii : NVPTXInst<(outs Float64Regs:$dst),
                       (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set Float64Regs:$dst, (fadd
-                        (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
 }
 
-// Due to a unknown reason (most likely a bug in tablegen), tablegen does not
-// automatically generate the rrr2 rule from
-// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
-// If we reverse the order of the following two lines, then rrr2 rule will be
-// generated for FMA32, but not for rrr.
-// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
-defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
-
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> {
-  def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
-            (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
-            Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c
-// b*c-a => fmad(b, c, -a)
-// - legal because b*c-a <=> b*c+(-a)
-multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
-  def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)),
-            (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>,
-            Requires<[Pred]>;
-  def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
-            (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
-            Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
-  def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)),
-            (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>,
-            Requires<[Pred]>;
-
-  def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a),
-            (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>,
-            Requires<[Pred]>;
-}
-
-defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
-defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
-defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
+defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
 
 def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
           "sin.approx.f32 \t$dst, $src;",
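FPCONTRACT64 follows the same shape as the f32 multiclass, so the f64 analogue of the contraction would be (hypothetical sketch, not part of this commit):

; Hypothetical sketch: the f64 path goes through FPCONTRACT64 and should
; select fma.rn.f64 at the default contraction level.
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
define double @fma_contract_f64(double %a, double %b, double %c) {
; CHECK: fma.rn.f64
  %mul = fmul double %a, %b
  %add = fadd double %mul, %c
  ret double %add
}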
test/CodeGen/NVPTX/imad.ll (new file, 9 lines)
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: imad
+define i32 @imad(i32 %a, i32 %b, i32 %c) {
+; CHECK: mad.lo.s32
+  %val0 = mul i32 %a, %b
+  %val1 = add i32 %val0, %c
+  ret i32 %val1
+}
test/CodeGen/NVPTX/mulwide.ll (new file, 37 lines)
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: mulwide16
+define i32 @mulwide16(i16 %a, i16 %b) {
+; CHECK: mul.wide.s16
+  %val0 = sext i16 %a to i32
+  %val1 = sext i16 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; CHECK: mulwideu16
+define i32 @mulwideu16(i16 %a, i16 %b) {
+; CHECK: mul.wide.u16
+  %val0 = zext i16 %a to i32
+  %val1 = zext i16 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; CHECK: mulwide32
+define i64 @mulwide32(i32 %a, i32 %b) {
+; CHECK: mul.wide.s32
+  %val0 = sext i32 %a to i64
+  %val1 = sext i32 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
+
+; CHECK: mulwideu32
+define i64 @mulwideu32(i32 %a, i32 %b) {
+; CHECK: mul.wide.u32
+  %val0 = zext i32 %a to i64
+  %val1 = zext i32 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
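Both new files are self-contained lit tests; a hypothetical invocation from an LLVM build directory (paths depend on your checkout layout):

./bin/llvm-lit -v ../llvm/test/CodeGen/NVPTX/imad.ll ../llvm/test/CodeGen/NVPTX/mulwide.ll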