[NVPTX] Use approximate FP ops when unsafe-fp-math is used, and append

.ftz to instructions if the nvptx-f32ftz attribute is set to "true"

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186820 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Justin Holewinski 2013-07-22 12:18:04 +00:00
parent 1abb7bc7e9
commit 3a8ee4ffd7
4 changed files with 104 additions and 56 deletions

View File

@ -25,11 +25,6 @@
using namespace llvm;
static cl::opt<bool> UseFMADInstruction(
"nvptx-mad-enable", cl::ZeroOrMore,
cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
cl::init(false));
static cl::opt<int>
FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
@ -47,6 +42,12 @@ UsePrecSqrtF32("nvptx-prec-sqrtf32",
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
static cl::opt<bool>
FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore,
cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
cl::init(false));
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@ -58,12 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel),
Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
// Always do fma.f32 fpcontract if the target supports the instruction.
// Always do fma.f64 fpcontract if the target supports the instruction.
// Do mad.f32 is nvptx-mad-enable is specified and the target does not
// support fma.f32.
doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
doFMAF32AGG =
@ -71,20 +67,51 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
doFMAF64AGG =
(OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
UseF32FTZ = false;
allowFMA = (FMAContractLevel >= 1);
doMulWide = (OptLevel > 0);
}
// Decide how to translate f32 div
do_DIVF32_PREC = UsePrecDivF32;
// Decide how to translate f32 sqrt
do_SQRTF32_PREC = UsePrecSqrtF32;
// sm less than sm_20 does not support div.rnd. Use div.full.
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
do_DIVF32_PREC = 1;
int NVPTXDAGToDAGISel::getDivF32Level() const {
if (UsePrecDivF32.getNumOccurrences() > 0) {
// If nvptx-prec-div32=N is used on the command-line, always honor it
return UsePrecDivF32;
} else {
// Otherwise, use div.approx if fast math is enabled
if (TM.Options.UnsafeFPMath)
return 0;
else
return 2;
}
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
if (UsePrecSqrtF32.getNumOccurrences() > 0) {
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
return UsePrecSqrtF32;
} else {
// Otherwise, use sqrt.approx if fast math is enabled
if (TM.Options.UnsafeFPMath)
return false;
else
return true;
}
}
bool NVPTXDAGToDAGISel::useF32FTZ() const {
if (FtzEnabled.getNumOccurrences() > 0) {
// If nvptx-f32ftz is used on the command-line, always honor it
return FtzEnabled;
} else {
const Function *F = MF->getFunction();
// Otherwise, check for an nvptx-f32ftz attribute on the function
if (F->hasFnAttribute("nvptx-f32ftz"))
return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
"nvptx-f32ftz")
.getValueAsString() == "true");
else
return false;
}
}
/// Select - Select instructions not customized! Used for

View File

@ -28,38 +28,22 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// If true, generate corresponding FPCONTRACT. This is
// language dependent (i.e. CUDA and OpenCL works differently).
bool doFMADF32;
bool doFMAF64;
bool doFMAF32;
bool doFMAF64AGG;
bool doFMAF32AGG;
bool allowFMA;
// 0: use div.approx
// 1: use div.full
// 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
// Otherwise, use div.full
int do_DIVF32_PREC;
// If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
// is true, then generate the corresponding FTZ version.
bool do_SQRTF32_PREC;
// If true, add .ftz to f32 instructions.
// This is only meaningful for sm_20 and later, as the default
// is not ftz.
// For sm earlier than sm_20, f32 denorms are always ftz by the
// hardware.
// We always add the .ftz modifier regardless of the sm value
// when Use32FTZ is true.
bool UseF32FTZ;
// If true, generate mul.wide from sext and mul
bool doMulWide;
int getDivF32Level() const;
bool usePrecSqrtF32() const;
bool useF32FTZ() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
CodeGenOpt::Level OptLevel);
CodeGenOpt::Level OptLevel);
// Pass Name
virtual const char *getPassName() const {

View File

@ -136,28 +136,26 @@ def hasLDG : Predicate<"Subtarget.hasLDG()">;
def hasLDU : Predicate<"Subtarget.hasLDU()">;
def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
def doF32FTZ : Predicate<"UseF32FTZ==1">;
def doNoF32FTZ : Predicate<"UseF32FTZ==0">;
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doFMAF32 : Predicate<"doFMAF32">;
def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">;
def doFMAF32AGG : Predicate<"doFMAF32AGG">;
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">;
def doFMAF64 : Predicate<"doFMAF64">;
def doFMAF64AGG : Predicate<"doFMAF64AGG">;
def doFMADF32 : Predicate<"doFMADF32">;
def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
def doMulWide : Predicate<"doMulWide">;
def allowFMA : Predicate<"allowFMA">;
def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
@ -864,8 +862,6 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
// If we reverse the order of the following two lines, then rrr2 rule will be
// generated for FMA32, but not for rrr.
// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
@ -904,8 +900,6 @@ multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),

View File

@ -0,0 +1,43 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
declare float @llvm.nvvm.sqrt.f(float)
; CHECK: sqrt_div
; CHECK: sqrt.rn.f32
; CHECK: div.rn.f32
define float @sqrt_div(float %a, float %b) {
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
%t2 = fdiv float %t1, %b
ret float %t2
}
; CHECK: sqrt_div_fast
; CHECK: sqrt.approx.f32
; CHECK: div.approx.f32
define float @sqrt_div_fast(float %a, float %b) #0 {
%t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
%t2 = fdiv float %t1, %b
ret float %t2
}
; CHECK: fadd
; CHECK: add.f32
define float @fadd(float %a, float %b) {
%t1 = fadd float %a, %b
ret float %t1
}
; CHECK: fadd_ftz
; CHECK: add.ftz.f32
define float @fadd_ftz(float %a, float %b) #1 {
%t1 = fadd float %a, %b
ret float %t1
}
attributes #0 = { "unsafe-fp-math" = "true" }
attributes #1 = { "nvptx-f32ftz" = "true" }