mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-13 04:30:23 +00:00
[NVPTX] Use approximate FP ops when unsafe-fp-math is used, and append
.ftz to instructions if the nvptx-f32ftz attribute is set to "true".
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186820 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
1abb7bc7e9
commit
3a8ee4ffd7
@ -25,11 +25,6 @@
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<bool> UseFMADInstruction(
|
||||
"nvptx-mad-enable", cl::ZeroOrMore,
|
||||
cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
|
||||
cl::init(false));
|
||||
|
||||
static cl::opt<int>
|
||||
FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
|
||||
cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
|
||||
@ -47,6 +42,12 @@ UsePrecSqrtF32("nvptx-prec-sqrtf32",
|
||||
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
|
||||
cl::init(true));
|
||||
|
||||
// Command-line switch "nvptx-f32ftz" (defaults to false). useF32FTZ() honors
// this value whenever the option appears on the command line; otherwise it
// falls back to the per-function "nvptx-f32ftz" attribute.
static cl::opt<bool>
|
||||
FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore,
|
||||
cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
|
||||
cl::init(false));
|
||||
|
||||
|
||||
/// createNVPTXISelDag - This pass converts a legalized DAG into a
|
||||
/// NVPTX-specific DAG, ready for instruction scheduling.
|
||||
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
|
||||
@ -58,12 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
||||
CodeGenOpt::Level OptLevel)
|
||||
: SelectionDAGISel(tm, OptLevel),
|
||||
Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
|
||||
// Always do fma.f32 fpcontract if the target supports the instruction.
|
||||
// Always do fma.f64 fpcontract if the target supports the instruction.
|
||||
// Do mad.f32 if nvptx-mad-enable is specified and the target does not
|
||||
// support fma.f32.
|
||||
|
||||
doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
|
||||
doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
|
||||
doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
|
||||
doFMAF32AGG =
|
||||
@ -71,20 +67,51 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
||||
doFMAF64AGG =
|
||||
(OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
|
||||
|
||||
allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
|
||||
|
||||
UseF32FTZ = false;
|
||||
allowFMA = (FMAContractLevel >= 1);
|
||||
|
||||
doMulWide = (OptLevel > 0);
|
||||
}
|
||||
|
||||
// Decide how to translate f32 div
|
||||
do_DIVF32_PREC = UsePrecDivF32;
|
||||
// Decide how to translate f32 sqrt
|
||||
do_SQRTF32_PREC = UsePrecSqrtF32;
|
||||
// sm less than sm_20 does not support div.rnd. Use div.full.
|
||||
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
|
||||
do_DIVF32_PREC = 1;
|
||||
/// getDivF32Level - Pick the precision level used when selecting f32
/// division. The result is compared against 0 (div.approx) and 1 (div.full)
/// by the do_DIVF32_* predicates.
///
/// An explicit UsePrecDivF32 command-line value always wins. Without one,
/// the level is 0 when TM.Options.UnsafeFPMath is set and 2 otherwise.
int NVPTXDAGToDAGISel::getDivF32Level() const {
|
||||
if (UsePrecDivF32.getNumOccurrences() > 0) {
|
||||
// If nvptx-prec-div32=N is used on the command-line, always honor it
|
||||
return UsePrecDivF32;
|
||||
} else {
|
||||
// Otherwise, use div.approx if fast math is enabled
|
||||
if (TM.Options.UnsafeFPMath)
|
||||
return 0;
|
||||
else
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/// usePrecSqrtF32 - Decide between the precise and approximate f32 sqrt.
/// Returns true to select sqrt.rn and false to select sqrt.approx (see the
/// do_SQRTF32_RN / do_SQRTF32_APPROX predicates).
///
/// An explicit nvptx-prec-sqrtf32 command-line value always wins. Without
/// one, the result is false when TM.Options.UnsafeFPMath is set and true
/// otherwise.
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
|
||||
if (UsePrecSqrtF32.getNumOccurrences() > 0) {
|
||||
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
|
||||
return UsePrecSqrtF32;
|
||||
} else {
|
||||
// Otherwise, use sqrt.approx if fast math is enabled
|
||||
if (TM.Options.UnsafeFPMath)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// useF32FTZ - Decide whether f32 instructions are selected in their .ftz
/// form (this is what the doF32FTZ / doNoF32FTZ predicates test).
///
/// An explicit nvptx-f32ftz command-line value always wins. Without one,
/// the result is true only when the function returned by MF->getFunction()
/// carries an "nvptx-f32ftz" attribute whose string value is exactly "true".
/// A missing attribute, or any other value, yields false.
bool NVPTXDAGToDAGISel::useF32FTZ() const {
|
||||
if (FtzEnabled.getNumOccurrences() > 0) {
|
||||
// If nvptx-f32ftz is used on the command-line, always honor it
|
||||
return FtzEnabled;
|
||||
} else {
|
||||
const Function *F = MF->getFunction();
|
||||
// Otherwise, check for an nvptx-f32ftz attribute on the function
|
||||
if (F->hasFnAttribute("nvptx-f32ftz"))
|
||||
return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
|
||||
"nvptx-f32ftz")
|
||||
.getValueAsString() == "true");
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Select - Select instructions not customized! Used for
|
||||
|
@ -28,38 +28,22 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
|
||||
|
||||
// If true, generate corresponding FPCONTRACT. This is
|
||||
// language dependent (i.e. CUDA and OpenCL work differently).
|
||||
bool doFMADF32;
|
||||
bool doFMAF64;
|
||||
bool doFMAF32;
|
||||
bool doFMAF64AGG;
|
||||
bool doFMAF32AGG;
|
||||
bool allowFMA;
|
||||
|
||||
// 0: use div.approx
|
||||
// 1: use div.full
|
||||
// 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
|
||||
// Otherwise, use div.full
|
||||
int do_DIVF32_PREC;
|
||||
|
||||
// If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
|
||||
// is true, then generate the corresponding FTZ version.
|
||||
bool do_SQRTF32_PREC;
|
||||
|
||||
// If true, add .ftz to f32 instructions.
|
||||
// This is only meaningful for sm_20 and later, as the default
|
||||
// is not ftz.
|
||||
// For sm earlier than sm_20, f32 denorms are always ftz by the
|
||||
// hardware.
|
||||
// We always add the .ftz modifier regardless of the sm value
|
||||
// when UseF32FTZ is true.
|
||||
bool UseF32FTZ;
|
||||
|
||||
// If true, generate mul.wide from sext and mul
|
||||
bool doMulWide;
|
||||
|
||||
int getDivF32Level() const;
|
||||
bool usePrecSqrtF32() const;
|
||||
bool useF32FTZ() const;
|
||||
|
||||
public:
|
||||
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
|
||||
CodeGenOpt::Level OptLevel);
|
||||
CodeGenOpt::Level OptLevel);
|
||||
|
||||
// Pass Name
|
||||
virtual const char *getPassName() const {
|
||||
|
@ -136,28 +136,26 @@ def hasLDG : Predicate<"Subtarget.hasLDG()">;
|
||||
def hasLDU : Predicate<"Subtarget.hasLDU()">;
|
||||
def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
|
||||
|
||||
def doF32FTZ : Predicate<"UseF32FTZ==1">;
|
||||
def doNoF32FTZ : Predicate<"UseF32FTZ==0">;
|
||||
def doF32FTZ : Predicate<"useF32FTZ()">;
|
||||
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
|
||||
|
||||
def doFMAF32 : Predicate<"doFMAF32">;
|
||||
def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
|
||||
def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">;
|
||||
def doFMAF32AGG : Predicate<"doFMAF32AGG">;
|
||||
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
|
||||
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">;
|
||||
def doFMAF64 : Predicate<"doFMAF64">;
|
||||
def doFMAF64AGG : Predicate<"doFMAF64AGG">;
|
||||
def doFMADF32 : Predicate<"doFMADF32">;
|
||||
def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
|
||||
|
||||
def doMulWide : Predicate<"doMulWide">;
|
||||
|
||||
def allowFMA : Predicate<"allowFMA">;
|
||||
def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
|
||||
def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
|
||||
|
||||
def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
|
||||
def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
|
||||
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
|
||||
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
|
||||
|
||||
def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
|
||||
def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
|
||||
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
|
||||
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
|
||||
|
||||
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
|
||||
|
||||
@ -864,8 +862,6 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
|
||||
// If we reverse the order of the following two lines, then rrr2 rule will be
|
||||
// generated for FMA32, but not for rrr.
|
||||
// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
|
||||
defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
|
||||
defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
|
||||
defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
|
||||
defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
|
||||
defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
|
||||
@ -904,8 +900,6 @@ multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
|
||||
|
||||
defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
|
||||
defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
|
||||
defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
|
||||
defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
|
||||
defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
|
||||
|
||||
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
|
||||
|
43
test/CodeGen/NVPTX/fast-math.ll
Normal file
43
test/CodeGen/NVPTX/fast-math.ll
Normal file
@ -0,0 +1,43 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
|
||||
declare float @llvm.nvvm.sqrt.f(float)
|
||||
|
||||
|
||||
; CHECK: sqrt_div
|
||||
; CHECK: sqrt.rn.f32
|
||||
; CHECK: div.rn.f32
|
||||
define float @sqrt_div(float %a, float %b) {
|
||||
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
|
||||
%t2 = fdiv float %t1, %b
|
||||
ret float %t2
|
||||
}
|
||||
|
||||
; CHECK: sqrt_div_fast
|
||||
; CHECK: sqrt.approx.f32
|
||||
; CHECK: div.approx.f32
|
||||
define float @sqrt_div_fast(float %a, float %b) #0 {
|
||||
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
|
||||
%t2 = fdiv float %t1, %b
|
||||
ret float %t2
|
||||
}
|
||||
|
||||
|
||||
; CHECK: fadd
|
||||
; CHECK: add.f32
|
||||
define float @fadd(float %a, float %b) {
|
||||
%t1 = fadd float %a, %b
|
||||
ret float %t1
|
||||
}
|
||||
|
||||
; CHECK: fadd_ftz
|
||||
; CHECK: add.ftz.f32
|
||||
define float @fadd_ftz(float %a, float %b) #1 {
|
||||
%t1 = fadd float %a, %b
|
||||
ret float %t1
|
||||
}
|
||||
|
||||
|
||||
|
||||
attributes #0 = { "unsafe-fp-math" = "true" }
|
||||
attributes #1 = { "nvptx-f32ftz" = "true" }
|
Loading…
Reference in New Issue
Block a user