From 707fd44038edf9ec0d3fe7b99d51e7c71e36f9d0 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Fri, 17 Jun 2011 12:12:42 +0000 Subject: [PATCH] PTX: Adjust rounding modes * rounding modes for fp add, mul, sub now use .rn * float -> int rounding correctly uses .rzi not .rni * 32bit fdiv for sm13 uses div.rn (instead of div.approx) * 32bit fdiv for sm10 now uses div (instead of div.approx) Approx is not IEEE 754 compatible (and should be optionally set by a flag to the backend instead). The .rn rounding modifier is the PTX default anyway, but it's better to be explicit. All these modifiers should be available by using __fmul_rz functions for example, but support will need to be added for this in the backend. Patch by Dan Bailey git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@133253 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PTX/PTXInstrInfo.td | 59 +++++++++++++++++++++------------- test/CodeGen/PTX/add.ll | 8 ++--- test/CodeGen/PTX/cvt.ll | 16 ++++----- test/CodeGen/PTX/fdiv-sm10.ll | 2 +- test/CodeGen/PTX/fdiv-sm13.ll | 2 +- test/CodeGen/PTX/mul.ll | 8 ++--- test/CodeGen/PTX/sub.ll | 8 ++--- 7 files changed, 59 insertions(+), 44 deletions(-) diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 71f7cc32b8a..8477cd71d31 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -584,24 +584,39 @@ defm REM : INT3<"rem", urem>; defm FNEG : PTX_FLOAT_2OP<"neg", fneg>; // Standard Binary Operations -defm FADD : PTX_FLOAT_3OP<"add", fadd>; -defm FSUB : PTX_FLOAT_3OP<"sub", fsub>; -defm FMUL : PTX_FLOAT_3OP<"mul", fmul>; +defm FADD : PTX_FLOAT_3OP<"add.rn", fadd>; +defm FSUB : PTX_FLOAT_3OP<"sub.rn", fsub>; +defm FMUL : PTX_FLOAT_3OP<"mul.rn", fmul>; -// TODO: Allow user selection of rounding modes for fdiv. -// For division, we need to have f32 and f64 differently. -// For f32, we just always use .approx since it is supported on all hardware -// for PTX 1.4+, which is our minimum target. -def FDIVrr32 : InstPTX<(outs RegF32:$d), +// For floating-point division: +// SM_13+ defaults to .rn for f32 and f64, +// SM10 must *not* provide a rounding + +// TODO: +// - Allow user selection of rounding modes for fdiv +// - Add support for -prec-div=false (.approx) + +def FDIVrr32SM13 : InstPTX<(outs RegF32:$d), (ins RegF32:$a, RegF32:$b), - "div.approx.f32\t$d, $a, $b", - [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>; -def FDIVri32 : InstPTX<(outs RegF32:$d), + "div.rn.f32\t$d, $a, $b", + [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>, + Requires<[SupportsSM13]>; +def FDIVri32SM13 : InstPTX<(outs RegF32:$d), (ins RegF32:$a, f32imm:$b), - "div.approx.f32\t$d, $a, $b", - [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>; + "div.rn.f32\t$d, $a, $b", + [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>, + Requires<[SupportsSM13]>; +def FDIVrr32SM10 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, RegF32:$b), + "div.f32\t$d, $a, $b", + [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>, + Requires<[DoesNotSupportSM13]>; +def FDIVri32SM10 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, f32imm:$b), + "div.f32\t$d, $a, $b", + [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>, + Requires<[DoesNotSupportSM13]>; -// For f64, we must specify a rounding for sm 1.3+ but *not* for sm 1.0. def FDIVrr64SM13 : InstPTX<(outs RegF64:$d), (ins RegF64:$a, RegF64:$b), "div.rn.f64\t$d, $a, $b", @@ -825,11 +840,11 @@ def CVT_pred_u64 [(set RegPred:$d, (trunc RegI64:$a))]>; def CVT_pred_f32 - : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "cvt.rni.pred.f32\t$d, $a", + : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "cvt.rzi.pred.f32\t$d, $a", [(set RegPred:$d, (fp_to_uint RegF32:$a))]>; def CVT_pred_f64 - : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "cvt.rni.pred.f64\t$d, $a", + : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "cvt.rzi.pred.f64\t$d, $a", [(set RegPred:$d, (fp_to_uint RegF64:$a))]>; // Conversion to u16 @@ -847,11 +862,11 @@ def CVT_u16_u64 [(set RegI16:$d, (trunc RegI64:$a))]>; def CVT_u16_f32 - : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rni.u16.f32\t$d, $a", + : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rzi.u16.f32\t$d, $a", [(set RegI16:$d, (fp_to_uint RegF32:$a))]>; def CVT_u16_f64 - : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rni.u16.f64\t$d, $a", + : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rzi.u16.f64\t$d, $a", [(set RegI16:$d, (fp_to_uint RegF64:$a))]>; // Conversion to u32 @@ -869,11 +884,11 @@ def CVT_u32_u64 [(set RegI32:$d, (trunc RegI64:$a))]>; def CVT_u32_f32 - : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rni.u32.f32\t$d, $a", + : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rzi.u32.f32\t$d, $a", [(set RegI32:$d, (fp_to_uint RegF32:$a))]>; def CVT_u32_f64 - : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rni.u32.f64\t$d, $a", + : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rzi.u32.f64\t$d, $a", [(set RegI32:$d, (fp_to_uint RegF64:$a))]>; // Conversion to u64 @@ -891,11 +906,11 @@ def CVT_u64_u32 [(set RegI64:$d, (zext RegI32:$a))]>; def CVT_u64_f32 - : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rni.u64.f32\t$d, $a", + : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rzi.u64.f32\t$d, $a", [(set RegI64:$d, (fp_to_uint RegF32:$a))]>; def CVT_u64_f64 - : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rni.u64.f64\t$d, $a", + : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rzi.u64.f64\t$d, $a", [(set RegI64:$d, (fp_to_uint RegF64:$a))]>; // Conversion to f32 diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll index b89a2f62691..c16be4933c5 100644 --- a/test/CodeGen/PTX/add.ll +++ b/test/CodeGen/PTX/add.ll @@ -22,14 +22,14 @@ define ptx_device i64 @t1_u64(i64 %x, i64 %y) { } define ptx_device float @t1_f32(float %x, float %y) { -; CHECK: add.f32 r0, r1, r2 +; CHECK: add.rn.f32 r0, r1, r2 ; CHECK-NEXT: ret; %z = fadd float %x, %y ret float %z } define ptx_device double @t1_f64(double %x, double %y) { -; CHECK: add.f64 rd0, rd1, rd2 +; CHECK: add.rn.f64 rd0, rd1, rd2 ; CHECK-NEXT: ret; %z = fadd double %x, %y ret double %z @@ -57,14 +57,14 @@ define ptx_device i64 @t2_u64(i64 %x) { } define ptx_device float @t2_f32(float %x) { -; CHECK: add.f32 r0, r1, 0F3F800000; +; CHECK: add.rn.f32 r0, r1, 0F3F800000; ; CHECK-NEXT: ret; %z = fadd float %x, 1.0 ret float %z } define ptx_device double @t2_f64(double %x) { -; CHECK: add.f64 rd0, rd1, 0D3FF0000000000000; +; CHECK: add.rn.f64 rd0, rd1, 0D3FF0000000000000; ; CHECK-NEXT: ret; %z = fadd double %x, 1.0 ret double %z diff --git a/test/CodeGen/PTX/cvt.ll b/test/CodeGen/PTX/cvt.ll index 984cb4d8d5a..bf18bd79211 100644 --- a/test/CodeGen/PTX/cvt.ll +++ b/test/CodeGen/PTX/cvt.ll @@ -31,7 +31,7 @@ define ptx_device i32 @cvt_pred_i64(i64 %x, i1 %y) { } define ptx_device i32 @cvt_pred_f32(float %x, i1 %y) { -; CHECK: cvt.rni.pred.f32 p0, r1; +; CHECK: cvt.rzi.pred.f32 p0, r1; ; CHECK: ret; %a = fptoui float %x to i1 %b = and i1 %a, %y @@ -40,7 +40,7 @@ define ptx_device i32 @cvt_pred_f32(float %x, i1 %y) { } define ptx_device i32 @cvt_pred_f64(double %x, i1 %y) { -; CHECK: cvt.rni.pred.f64 p0, rd1; +; CHECK: cvt.rzi.pred.f64 p0, rd1; ; CHECK: ret; %a = fptoui double %x to i1 %b = and i1 %a, %y @@ -72,14 +72,14 @@ define ptx_device i16 @cvt_i16_i64(i64 %x) { } define ptx_device i16 @cvt_i16_f32(float %x) { -; CHECK: cvt.rni.u16.f32 rh0, r1; +; CHECK: cvt.rzi.u16.f32 rh0, r1; ; CHECK: ret; %a = fptoui float %x to i16 ret i16 %a } define ptx_device i16 @cvt_i16_f64(double %x) { -; CHECK: cvt.rni.u16.f64 rh0, rd1; +; CHECK: cvt.rzi.u16.f64 rh0, rd1; ; CHECK: ret; %a = fptoui double %x to i16 ret i16 %a @@ -109,14 +109,14 @@ define ptx_device i32 @cvt_i32_i64(i64 %x) { } define ptx_device i32 @cvt_i32_f32(float %x) { -; CHECK: cvt.rni.u32.f32 r0, r1; +; CHECK: cvt.rzi.u32.f32 r0, r1; ; CHECK: ret; %a = fptoui float %x to i32 ret i32 %a } define ptx_device i32 @cvt_i32_f64(double %x) { -; CHECK: cvt.rni.u32.f64 r0, rd1; +; CHECK: cvt.rzi.u32.f64 r0, rd1; ; CHECK: ret; %a = fptoui double %x to i32 ret i32 %a @@ -146,14 +146,14 @@ define ptx_device i64 @cvt_i64_i32(i32 %x) { } define ptx_device i64 @cvt_i64_f32(float %x) { -; CHECK: cvt.rni.u64.f32 rd0, r1; +; CHECK: cvt.rzi.u64.f32 rd0, r1; ; CHECK: ret; %a = fptoui float %x to i64 ret i64 %a } define ptx_device i64 @cvt_i64_f64(double %x) { -; CHECK: cvt.rni.u64.f64 rd0, rd1; +; CHECK: cvt.rzi.u64.f64 rd0, rd1; ; CHECK: ret; %a = fptoui double %x to i64 ret i64 %a diff --git a/test/CodeGen/PTX/fdiv-sm10.ll b/test/CodeGen/PTX/fdiv-sm10.ll index 9aff25111b0..eb32222f3a3 100644 --- a/test/CodeGen/PTX/fdiv-sm10.ll +++ b/test/CodeGen/PTX/fdiv-sm10.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=ptx32 -mattr=+sm10 | FileCheck %s define ptx_device float @t1_f32(float %x, float %y) { -; CHECK: div.approx.f32 r0, r1, r2; +; CHECK: div.f32 r0, r1, r2; ; CHECK-NEXT: ret; %a = fdiv float %x, %y ret float %a diff --git a/test/CodeGen/PTX/fdiv-sm13.ll b/test/CodeGen/PTX/fdiv-sm13.ll index 84e0adab7e1..ad24f35b206 100644 --- a/test/CodeGen/PTX/fdiv-sm13.ll +++ b/test/CodeGen/PTX/fdiv-sm13.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=ptx32 -mattr=+sm13 | FileCheck %s define ptx_device float @t1_f32(float %x, float %y) { -; CHECK: div.approx.f32 r0, r1, r2; +; CHECK: div.rn.f32 r0, r1, r2; ; CHECK-NEXT: ret; %a = fdiv float %x, %y ret float %a diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll index 93f94e35096..2093556dac4 100644 --- a/test/CodeGen/PTX/mul.ll +++ b/test/CodeGen/PTX/mul.ll @@ -11,28 +11,28 @@ ;} define ptx_device float @t1_f32(float %x, float %y) { -; CHECK: mul.f32 r0, r1, r2 +; CHECK: mul.rn.f32 r0, r1, r2 ; CHECK-NEXT: ret; %z = fmul float %x, %y ret float %z } define ptx_device double @t1_f64(double %x, double %y) { -; CHECK: mul.f64 rd0, rd1, rd2 +; CHECK: mul.rn.f64 rd0, rd1, rd2 ; CHECK-NEXT: ret; %z = fmul double %x, %y ret double %z } define ptx_device float @t2_f32(float %x) { -; CHECK: mul.f32 r0, r1, 0F40A00000; +; CHECK: mul.rn.f32 r0, r1, 0F40A00000; ; CHECK-NEXT: ret; %z = fmul float %x, 5.0 ret float %z } define ptx_device double @t2_f64(double %x) { -; CHECK: mul.f64 rd0, rd1, 0D4014000000000000; +; CHECK: mul.rn.f64 rd0, rd1, 0D4014000000000000; ; CHECK-NEXT: ret; %z = fmul double %x, 5.0 ret double %z diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll index 9efeaace0e7..4d552801a47 100644 --- a/test/CodeGen/PTX/sub.ll +++ b/test/CodeGen/PTX/sub.ll @@ -22,14 +22,14 @@ define ptx_device i64 @t1_u64(i64 %x, i64 %y) { } define ptx_device float @t1_f32(float %x, float %y) { -; CHECK: sub.f32 r0, r1, r2 +; CHECK: sub.rn.f32 r0, r1, r2 ; CHECK-NEXT: ret; %z = fsub float %x, %y ret float %z } define ptx_device double @t1_f64(double %x, double %y) { -; CHECK: sub.f64 rd0, rd1, rd2 +; CHECK: sub.rn.f64 rd0, rd1, rd2 ; CHECK-NEXT: ret; %z = fsub double %x, %y ret double %z @@ -57,14 +57,14 @@ define ptx_device i64 @t2_u64(i64 %x) { } define ptx_device float @t2_f32(float %x) { -; CHECK: add.f32 r0, r1, 0FBF800000; +; CHECK: add.rn.f32 r0, r1, 0FBF800000; ; CHECK-NEXT: ret; %z = fsub float %x, 1.0 ret float %z } define ptx_device double @t2_f64(double %x) { -; CHECK: add.f64 rd0, rd1, 0DBFF0000000000000; +; CHECK: add.rn.f64 rd0, rd1, 0DBFF0000000000000; ; CHECK-NEXT: ret; %z = fsub double %x, 1.0 ret double %z