mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-05-22 03:39:03 +00:00
[NVPTX] Use approximate FP ops when unsafe-fp-math is used, and append
.ftz to instructions if the nvptx-f32ftz attribute is set to "true".
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186820 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
1abb7bc7e9
commit
3a8ee4ffd7
@ -25,11 +25,6 @@
|
|||||||
|
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
static cl::opt<bool> UseFMADInstruction(
|
|
||||||
"nvptx-mad-enable", cl::ZeroOrMore,
|
|
||||||
cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
|
|
||||||
cl::init(false));
|
|
||||||
|
|
||||||
static cl::opt<int>
|
static cl::opt<int>
|
||||||
FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
|
FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
|
||||||
cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
|
cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
|
||||||
@ -47,6 +42,12 @@ UsePrecSqrtF32("nvptx-prec-sqrtf32",
|
|||||||
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
|
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
|
||||||
cl::init(true));
|
cl::init(true));
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore,
|
||||||
|
cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
|
||||||
|
cl::init(false));
|
||||||
|
|
||||||
|
|
||||||
/// createNVPTXISelDag - This pass converts a legalized DAG into a
|
/// createNVPTXISelDag - This pass converts a legalized DAG into a
|
||||||
/// NVPTX-specific DAG, ready for instruction scheduling.
|
/// NVPTX-specific DAG, ready for instruction scheduling.
|
||||||
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
|
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
|
||||||
@ -58,12 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
|||||||
CodeGenOpt::Level OptLevel)
|
CodeGenOpt::Level OptLevel)
|
||||||
: SelectionDAGISel(tm, OptLevel),
|
: SelectionDAGISel(tm, OptLevel),
|
||||||
Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
|
Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
|
||||||
// Always do fma.f32 fpcontract if the target supports the instruction.
|
|
||||||
// Always do fma.f64 fpcontract if the target supports the instruction.
|
|
||||||
// Do mad.f32 if nvptx-mad-enable is specified and the target does not
|
|
||||||
// support fma.f32.
|
|
||||||
|
|
||||||
doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
|
|
||||||
doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
|
doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
|
||||||
doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
|
doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
|
||||||
doFMAF32AGG =
|
doFMAF32AGG =
|
||||||
@ -71,20 +67,51 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
|||||||
doFMAF64AGG =
|
doFMAF64AGG =
|
||||||
(OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
|
(OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
|
||||||
|
|
||||||
allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
|
allowFMA = (FMAContractLevel >= 1);
|
||||||
|
|
||||||
UseF32FTZ = false;
|
|
||||||
|
|
||||||
doMulWide = (OptLevel > 0);
|
doMulWide = (OptLevel > 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Decide how to translate f32 div
|
int NVPTXDAGToDAGISel::getDivF32Level() const {
|
||||||
do_DIVF32_PREC = UsePrecDivF32;
|
if (UsePrecDivF32.getNumOccurrences() > 0) {
|
||||||
// Decide how to translate f32 sqrt
|
// If nvptx-prec-divf32=N is used on the command-line, always honor it
|
||||||
do_SQRTF32_PREC = UsePrecSqrtF32;
|
return UsePrecDivF32;
|
||||||
// sm less than sm_20 does not support div.rnd. Use div.full.
|
} else {
|
||||||
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
|
// Otherwise, use div.approx if fast math is enabled
|
||||||
do_DIVF32_PREC = 1;
|
if (TM.Options.UnsafeFPMath)
|
||||||
|
return 0;
|
||||||
|
else
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
|
||||||
|
if (UsePrecSqrtF32.getNumOccurrences() > 0) {
|
||||||
|
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
|
||||||
|
return UsePrecSqrtF32;
|
||||||
|
} else {
|
||||||
|
// Otherwise, use sqrt.approx if fast math is enabled
|
||||||
|
if (TM.Options.UnsafeFPMath)
|
||||||
|
return false;
|
||||||
|
else
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool NVPTXDAGToDAGISel::useF32FTZ() const {
|
||||||
|
if (FtzEnabled.getNumOccurrences() > 0) {
|
||||||
|
// If nvptx-f32ftz is used on the command-line, always honor it
|
||||||
|
return FtzEnabled;
|
||||||
|
} else {
|
||||||
|
const Function *F = MF->getFunction();
|
||||||
|
// Otherwise, check for an nvptx-f32ftz attribute on the function
|
||||||
|
if (F->hasFnAttribute("nvptx-f32ftz"))
|
||||||
|
return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
|
||||||
|
"nvptx-f32ftz")
|
||||||
|
.getValueAsString() == "true");
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Select - Select instructions not customized! Used for
|
/// Select - Select instructions not customized! Used for
|
||||||
|
@ -28,38 +28,22 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
|
|||||||
|
|
||||||
// If true, generate corresponding FPCONTRACT. This is
|
// If true, generate corresponding FPCONTRACT. This is
|
||||||
// language dependent (i.e. CUDA and OpenCL work differently).
|
// language dependent (i.e. CUDA and OpenCL work differently).
|
||||||
bool doFMADF32;
|
|
||||||
bool doFMAF64;
|
bool doFMAF64;
|
||||||
bool doFMAF32;
|
bool doFMAF32;
|
||||||
bool doFMAF64AGG;
|
bool doFMAF64AGG;
|
||||||
bool doFMAF32AGG;
|
bool doFMAF32AGG;
|
||||||
bool allowFMA;
|
bool allowFMA;
|
||||||
|
|
||||||
// 0: use div.approx
|
|
||||||
// 1: use div.full
|
|
||||||
// 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
|
|
||||||
// Otherwise, use div.full
|
|
||||||
int do_DIVF32_PREC;
|
|
||||||
|
|
||||||
// If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
|
|
||||||
// is true, then generate the corresponding FTZ version.
|
|
||||||
bool do_SQRTF32_PREC;
|
|
||||||
|
|
||||||
// If true, add .ftz to f32 instructions.
|
|
||||||
// This is only meaningful for sm_20 and later, as the default
|
|
||||||
// is not ftz.
|
|
||||||
// For sm earlier than sm_20, f32 denorms are always ftz by the
|
|
||||||
// hardware.
|
|
||||||
// We always add the .ftz modifier regardless of the sm value
|
|
||||||
// when UseF32FTZ is true.
|
|
||||||
bool UseF32FTZ;
|
|
||||||
|
|
||||||
// If true, generate mul.wide from sext and mul
|
// If true, generate mul.wide from sext and mul
|
||||||
bool doMulWide;
|
bool doMulWide;
|
||||||
|
|
||||||
|
int getDivF32Level() const;
|
||||||
|
bool usePrecSqrtF32() const;
|
||||||
|
bool useF32FTZ() const;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
||||||
CodeGenOpt::Level OptLevel);
|
CodeGenOpt::Level OptLevel);
|
||||||
|
|
||||||
// Pass Name
|
// Pass Name
|
||||||
virtual const char *getPassName() const {
|
virtual const char *getPassName() const {
|
||||||
|
@ -136,28 +136,26 @@ def hasLDG : Predicate<"Subtarget.hasLDG()">;
|
|||||||
def hasLDU : Predicate<"Subtarget.hasLDU()">;
|
def hasLDU : Predicate<"Subtarget.hasLDU()">;
|
||||||
def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
|
def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
|
||||||
|
|
||||||
def doF32FTZ : Predicate<"UseF32FTZ==1">;
|
def doF32FTZ : Predicate<"useF32FTZ()">;
|
||||||
def doNoF32FTZ : Predicate<"UseF32FTZ==0">;
|
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
|
||||||
|
|
||||||
def doFMAF32 : Predicate<"doFMAF32">;
|
def doFMAF32 : Predicate<"doFMAF32">;
|
||||||
def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
|
def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">;
|
||||||
def doFMAF32AGG : Predicate<"doFMAF32AGG">;
|
def doFMAF32AGG : Predicate<"doFMAF32AGG">;
|
||||||
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
|
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">;
|
||||||
def doFMAF64 : Predicate<"doFMAF64">;
|
def doFMAF64 : Predicate<"doFMAF64">;
|
||||||
def doFMAF64AGG : Predicate<"doFMAF64AGG">;
|
def doFMAF64AGG : Predicate<"doFMAF64AGG">;
|
||||||
def doFMADF32 : Predicate<"doFMADF32">;
|
|
||||||
def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
|
|
||||||
|
|
||||||
def doMulWide : Predicate<"doMulWide">;
|
def doMulWide : Predicate<"doMulWide">;
|
||||||
|
|
||||||
def allowFMA : Predicate<"allowFMA">;
|
def allowFMA : Predicate<"allowFMA">;
|
||||||
def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
|
def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
|
||||||
|
|
||||||
def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
|
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
|
||||||
def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
|
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
|
||||||
|
|
||||||
def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
|
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
|
||||||
def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
|
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
|
||||||
|
|
||||||
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
|
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
|
||||||
|
|
||||||
@ -864,8 +862,6 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
|
|||||||
// If we reverse the order of the following two lines, then rrr2 rule will be
|
// If we reverse the order of the following two lines, then rrr2 rule will be
|
||||||
// generated for FMA32, but not for rrr.
|
// generated for FMA32, but not for rrr.
|
||||||
// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
|
// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
|
||||||
defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
|
|
||||||
defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
|
|
||||||
defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
|
defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
|
||||||
defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
|
defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
|
||||||
defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
|
defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
|
||||||
@ -904,8 +900,6 @@ multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
|
|||||||
|
|
||||||
defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
|
defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
|
||||||
defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
|
defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
|
||||||
defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
|
|
||||||
defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
|
|
||||||
defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
|
defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
|
||||||
|
|
||||||
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
|
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
|
||||||
|
43
test/CodeGen/NVPTX/fast-math.ll
Normal file
43
test/CodeGen/NVPTX/fast-math.ll
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||||
|
|
||||||
|
|
||||||
|
declare float @llvm.nvvm.sqrt.f(float)
|
||||||
|
|
||||||
|
|
||||||
|
; CHECK: sqrt_div
|
||||||
|
; CHECK: sqrt.rn.f32
|
||||||
|
; CHECK: div.rn.f32
|
||||||
|
define float @sqrt_div(float %a, float %b) {
|
||||||
|
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
|
||||||
|
%t2 = fdiv float %t1, %b
|
||||||
|
ret float %t2
|
||||||
|
}
|
||||||
|
|
||||||
|
; CHECK: sqrt_div_fast
|
||||||
|
; CHECK: sqrt.approx.f32
|
||||||
|
; CHECK: div.approx.f32
|
||||||
|
define float @sqrt_div_fast(float %a, float %b) #0 {
|
||||||
|
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
|
||||||
|
%t2 = fdiv float %t1, %b
|
||||||
|
ret float %t2
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; CHECK: fadd
|
||||||
|
; CHECK: add.f32
|
||||||
|
define float @fadd(float %a, float %b) {
|
||||||
|
%t1 = fadd float %a, %b
|
||||||
|
ret float %t1
|
||||||
|
}
|
||||||
|
|
||||||
|
; CHECK: fadd_ftz
|
||||||
|
; CHECK: add.ftz.f32
|
||||||
|
define float @fadd_ftz(float %a, float %b) #1 {
|
||||||
|
%t1 = fadd float %a, %b
|
||||||
|
ret float %t1
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
attributes #0 = { "unsafe-fp-math" = "true" }
|
||||||
|
attributes #1 = { "nvptx-f32ftz" = "true" }
|
Loading…
x
Reference in New Issue
Block a user