From 3a8ee4ffd783bd0cf2d83089edb43ec546b49d0d Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 22 Jul 2013 12:18:04 +0000 Subject: [PATCH] [NVPTX] Use approximate FP ops when unsafe-fp-math is used, and append .ftz to instructions if the nvptx-f32ftz attribute is set to "true" git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186820 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 67 ++++++++++++++++++-------- lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 26 ++-------- lib/Target/NVPTX/NVPTXInstrInfo.td | 24 ++++----- test/CodeGen/NVPTX/fast-math.ll | 43 +++++++++++++++++ 4 files changed, 104 insertions(+), 56 deletions(-) create mode 100644 test/CodeGen/NVPTX/fast-math.ll diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index b613587f2d0..ba85e35a73a 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -25,11 +25,6 @@ using namespace llvm; -static cl::opt UseFMADInstruction( - "nvptx-mad-enable", cl::ZeroOrMore, - cl::desc("NVPTX Specific: Enable generating FMAD instructions"), - cl::init(false)); - static cl::opt FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" @@ -47,6 +42,12 @@ UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true)); +static cl::opt +FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, + cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."), + cl::init(false)); + + /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. 
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, @@ -58,12 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), Subtarget(tm.getSubtarget()) { - // Always do fma.f32 fpcontract if the target supports the instruction. - // Always do fma.f64 fpcontract if the target supports the instruction. - // Do mad.f32 is nvptx-mad-enable is specified and the target does not - // support fma.f32. - doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32(); doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1); doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1); doFMAF32AGG = @@ -71,20 +67,51 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, doFMAF64AGG = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2); - allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction; - - UseF32FTZ = false; + allowFMA = (FMAContractLevel >= 1); doMulWide = (OptLevel > 0); +} - // Decide how to translate f32 div - do_DIVF32_PREC = UsePrecDivF32; - // Decide how to translate f32 sqrt - do_SQRTF32_PREC = UsePrecSqrtF32; - // sm less than sm_20 does not support div.rnd. Use div.full. 
- if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20()) - do_DIVF32_PREC = 1; +int NVPTXDAGToDAGISel::getDivF32Level() const { + if (UsePrecDivF32.getNumOccurrences() > 0) { + // If nvptx-prec-divf32=N is used on the command-line, always honor it + return UsePrecDivF32; + } else { + // Otherwise, use div.approx if fast math is enabled + if (TM.Options.UnsafeFPMath) + return 0; + else + return 2; + } +} +bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { + if (UsePrecSqrtF32.getNumOccurrences() > 0) { + // If nvptx-prec-sqrtf32 is used on the command-line, always honor it + return UsePrecSqrtF32; + } else { + // Otherwise, use sqrt.approx if fast math is enabled + if (TM.Options.UnsafeFPMath) + return false; + else + return true; + } +} + +bool NVPTXDAGToDAGISel::useF32FTZ() const { + if (FtzEnabled.getNumOccurrences() > 0) { + // If nvptx-f32ftz is used on the command-line, always honor it + return FtzEnabled; + } else { + const Function *F = MF->getFunction(); + // Otherwise, check for an nvptx-f32ftz attribute on the function + if (F->hasFnAttribute("nvptx-f32ftz")) + return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex, + "nvptx-f32ftz") + .getValueAsString() == "true"); + else + return false; + } } /// Select - Select instructions not customized! Used for diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 428e7b22883..d961e501453 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -28,38 +28,22 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { // If true, generate corresponding FPCONTRACT. This is // language dependent (i.e. CUDA and OpenCL works differently). 
- bool doFMADF32; bool doFMAF64; bool doFMAF32; bool doFMAF64AGG; bool doFMAF32AGG; bool allowFMA; - // 0: use div.approx - // 1: use div.full - // 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated; - // Otherwise, use div.full - int do_DIVF32_PREC; - - // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ - // is true, then generate the corresponding FTZ version. - bool do_SQRTF32_PREC; - - // If true, add .ftz to f32 instructions. - // This is only meaningful for sm_20 and later, as the default - // is not ftz. - // For sm earlier than sm_20, f32 denorms are always ftz by the - // hardware. - // We always add the .ftz modifier regardless of the sm value - // when Use32FTZ is true. - bool UseF32FTZ; - // If true, generate mul.wide from sext and mul bool doMulWide; + int getDivF32Level() const; + bool usePrecSqrtF32() const; + bool useF32FTZ() const; + public: explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, - CodeGenOpt::Level OptLevel); + CodeGenOpt::Level OptLevel); // Pass Name virtual const char *getPassName() const { diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index e6335a0d8e0..8ce16e9d1c4 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -136,28 +136,26 @@ def hasLDG : Predicate<"Subtarget.hasLDG()">; def hasLDU : Predicate<"Subtarget.hasLDU()">; def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">; -def doF32FTZ : Predicate<"UseF32FTZ==1">; -def doNoF32FTZ : Predicate<"UseF32FTZ==0">; +def doF32FTZ : Predicate<"useF32FTZ()">; +def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doFMAF32 : Predicate<"doFMAF32">; -def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">; +def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">; def doFMAF32AGG : Predicate<"doFMAF32AGG">; -def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">; +def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">; def doFMAF64 : Predicate<"doFMAF64">; def 
doFMAF64AGG : Predicate<"doFMAF64AGG">; -def doFMADF32 : Predicate<"doFMADF32">; -def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">; def doMulWide : Predicate<"doMulWide">; def allowFMA : Predicate<"allowFMA">; -def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">; +def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">; -def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">; -def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">; +def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; +def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; -def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">; -def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">; +def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; +def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; @@ -864,8 +862,6 @@ multiclass FPCONTRACT64 { // If we reverse the order of the following two lines, then rrr2 rule will be // generated for FMA32, but not for rrr. // Therefore, we manually write the rrr2 rule in FPCONTRACT32. 
-defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>; -defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>; defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; @@ -904,8 +900,6 @@ multiclass FPCONTRACT64_SUB_PAT { defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT; defm FMAF32ext : FPCONTRACT32_SUB_PAT; -defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD; -defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD; defm FMAF64ext : FPCONTRACT64_SUB_PAT; def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll new file mode 100644 index 00000000000..9da26adc151 --- /dev/null +++ b/test/CodeGen/NVPTX/fast-math.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +declare float @llvm.nvvm.sqrt.f(float) + + +; CHECK: sqrt_div +; CHECK: sqrt.rn.f32 +; CHECK: div.rn.f32 +define float @sqrt_div(float %a, float %b) { + %t1 = tail call float @llvm.nvvm.sqrt.f(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + +; CHECK: sqrt_div_fast +; CHECK: sqrt.approx.f32 +; CHECK: div.approx.f32 +define float @sqrt_div_fast(float %a, float %b) #0 { + %t1 = tail call float @llvm.nvvm.sqrt.f(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + + +; CHECK: fadd +; CHECK: add.f32 +define float @fadd(float %a, float %b) { + %t1 = fadd float %a, %b + ret float %t1 +} + +; CHECK: fadd_ftz +; CHECK: add.ftz.f32 +define float @fadd_ftz(float %a, float %b) #1 { + %t1 = fadd float %a, %b + ret float %t1 +} + + + +attributes #0 = { "unsafe-fp-math" = "true" } +attributes #1 = { "nvptx-f32ftz" = "true" }