From 8a44761afe88cfbc0b8741318cb24e1eadfb5698 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 14 Feb 2015 04:30:08 +0000 Subject: [PATCH] R600/SI: Implement correct f64 fdiv This version passes the OpenCL conformance test. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229239 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUInstructions.td | 15 +--- lib/Target/R600/CaymanInstructions.td | 2 +- lib/Target/R600/EvergreenInstructions.td | 2 +- lib/Target/R600/R600Instructions.td | 2 +- lib/Target/R600/SIISelLowering.cpp | 66 +++++++++++++++- lib/Target/R600/SIInstructions.td | 17 ++--- test/CodeGen/R600/fdiv.f64.ll | 96 ++++++++++++++++++++++++ test/CodeGen/R600/fdiv64.ll | 15 ---- test/CodeGen/R600/frem.ll | 16 ++-- test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll | 2 + 10 files changed, 184 insertions(+), 49 deletions(-) create mode 100644 test/CodeGen/R600/fdiv.f64.ll delete mode 100644 test/CodeGen/R600/fdiv64.ll diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 12b494eef9c..1fa51a9258a 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -668,17 +668,10 @@ class RcpPat : Pat < (RcpInst $src) >; -multiclass RsqPat { - def : Pat < - (fdiv FP_ONE, (fsqrt vt:$src)), - (RsqInst $src) - >; - - def : Pat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) - >; -} +class RsqPat : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; include "R600Instructions.td" include "R700Instructions.td" diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index 41b5fc0f876..ba4df82a6d3 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 -defm : RsqPat; +def : RsqPat; def : POW_Common ; diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 62b701d2231..9f9472c1c17 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -defm : RsqPat; +def : RsqPat; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 99097c47617..37c40d537d3 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1184,7 +1184,7 @@ let Predicates = [isR600] in { def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - defm : RsqPat; + def : RsqPat; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 141ba80492f..452b54c3032 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -211,6 +211,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); @@ -1130,7 +1131,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(); + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, MVT::i32); + + // Figure out if the scale to use for div_fmas. + SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); } SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 0f14dc31136..fbd38c959c8 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -2166,9 +2166,13 @@ def : Pat < //===----------------------------------------------------------------------===// let Predicates = [UnsafeFPMath] in { -def : RcpPat; -defm : RsqPat; -defm : RsqPat; + +//def : RcpPat; +//defm : RsqPat; +//defm : RsqPat; + +def : RsqPat; +def : RsqPat; } //===----------------------------------------------------------------------===// @@ -2685,13 +2689,6 @@ def : Pat < (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; -def : Pat< - (fdiv f64:$src0, f64:$src1), - (V_MUL_F64 0 /* src0_modifiers */, $src0, - 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1), - 0 /* clamp */, 0 /* omod */) ->; - def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, diff --git a/test/CodeGen/R600/fdiv.f64.ll b/test/CodeGen/R600/fdiv.f64.ll new file mode 100644 index 00000000000..276642f9901 --- /dev/null +++ b/test/CodeGen/R600/fdiv.f64.ll @@ -0,0 +1,96 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s + + +; COMMON-LABEL: {{^}}fdiv_f64: +; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 +; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] + +; Check for div_scale bug workaround on SI +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] + +; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] + +; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc + +; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] +; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] +; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] +; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] +; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA3]], [[FMA4]], [[MUL]] +; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] +; COMMON: buffer_store_dwordx2 [[RESULT]] +; COMMON: s_endpgm +define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %gep.1 = getelementptr double addrspace(1)* %in, i32 1 + %num = load double addrspace(1)* %in + %den = load double addrspace(1)* %gep.1 + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_v: +define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { + %den = load double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_v_s: +define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { + %num = load double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_s: +define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v2f64: +define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <2 x double> addrspace(1)* %in, i32 1 + %num = load <2 x double> addrspace(1)* %in + %den = load <2 x double> addrspace(1)* %gep.1 + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v2f64: +define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v4f64: +define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <4 x double> addrspace(1)* %in, i32 1 + %num = load <4 x double> addrspace(1)* %in + %den = load <4 x double> addrspace(1)* %gep.1 + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v4f64: +define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll deleted file mode 100644 index c081ccf50ca..00000000000 --- a/test/CodeGen/R600/fdiv64.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fdiv_f64: -; CHECK: v_rcp_f64_e32 {{v\[[0-9]+:[0-9]+\]}} -; CHECK: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} - -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double addrspace(1)* %in1 - %r1 = load double addrspace(1)* %in2 - %r2 = fdiv double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll index b1a51a41532..02a00704cf0 100644 --- a/test/CodeGen/R600/frem.ll +++ b/test/CodeGen/R600/frem.ll @@ -42,18 +42,16 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ret void } - ; FUNC-LABEL: {{^}}frem_f64: ; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 ; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; TODO: Check SI. -; CI: v_rcp_f64_e32 [[INVY:v\[[0-9]+:[0-9]+\]]], [[Y]] -; CI: v_mul_f64 [[DIV:v\[[0-9]+:[0-9]+\]]], [[X]], [[INVY]] -; CI: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[DIV]] -; CI: v_mul_f64 [[RESULTM:v\[[0-9]+:[0-9]+\]]], [[TRUNC]], [[Y]] -; SI: v_mul_f64 [[RESULTM:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, [[Y]] -; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[RESULTM]] -; GCN: buffer_store_dwordx2 [[RESULT]], {{.*}}, 0 +; GCN-DAG: v_div_fmas_f64 +; GCN-DAG: v_div_scale_f64 +; GCN-DAG: v_mul_f64 +; CI: v_trunc_f64_e32 +; CI: v_mul_f64 +; GCN: v_add_f64 +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll index b80658b5ed4..d2a655bf909 100644 --- a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll +++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll @@ -23,6 +23,8 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { ; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: ; SI-UNSAFE: v_rsq_f64_e32 ; SI-SAFE-NOT: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64 +; SI-SAFE: v_rcp_f64 define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone