diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 12677f0e062..71aec8400d7 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -636,6 +636,17 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
   return BB;
 }
 
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  // This currently forces unfolding various combinations of fsub into fma with
+  // free fneg'd operands. As long as we have fast FMA (controlled by
+  // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+  // When fma is quarter rate, for f64 where add / sub are at best half rate,
+  // most of these combines appear to be cycle neutral but save on instruction
+  // count / code size.
+  return true;
+}
+
 EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
   if (!VT.isVector()) {
     return MVT::i1;
@@ -647,6 +658,21 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
   return MVT::i32;
 }
 
+// Answering this is somewhat tricky, since it depends on the specific device,
+// and different devices have different rates for fma and for all f64 operations.
+//
+// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other,
+// regardless of the device (although the number of cycles differs between
+// devices), so fma is always profitable for f64.
+//
+// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
+// only on full rate devices. Normally, we should prefer selecting v_mad_f32,
+// which we can always do even without fused FP ops, since it returns the same
+// result as the separate operations and is always full rate. Therefore, we
+// lie and report that fma is not faster for f32. v_mad_f32, however, does not
+// support denormals, so we do report fma as faster on a fast-fma device when
+// denormals are required.
+//
 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   VT = VT.getScalarType();
 
@@ -655,7 +681,10 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f32:
+    // This is as fast on some subtargets. However, we always have full rate
+    // f32 mad available, which returns the same result as the separate
+    // operations, and which we should prefer over fma.
-    return Subtarget->hasFastFMAF32();
+    return false;
   case MVT::f64:
     return true;
   default:
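
To make the rate arithmetic concrete, here is a stand-alone sketch of the instruction-count argument behind enableAggressiveFMAFusion, using the aggressive fold exercised by the new test below. The cycle numbers are illustrative placeholders for an f64 quarter-rate-fma device, chosen only to match the relative rates described in the comments; they are not taken from this patch or from any particular chip.

  // fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
  #include <cassert>

  constexpr int kF64MulCycles = 16; // quarter rate multiply (assumed)
  constexpr int kF64FmaCycles = 16; // v_fma_f64 always matches v_mul_f64
  constexpr int kF64SubCycles = 8;  // f64 add / sub is at best half rate

  int main() {
    // Before the fold: fmul + fma + fsub, three instructions.
    int before = kF64MulCycles + kF64FmaCycles + kF64SubCycles;
    // After the fold: two fmas; the fneg is a free VOP3 source modifier.
    int after = kF64FmaCycles + kF64FmaCycles;
    assert(after <= before); // roughly cycle neutral, one instruction fewer
    return 0;
  }
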
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 876fd8c9f36..bb003f46f4b 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -95,6 +95,7 @@ public:
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                       MachineBasicBlock * BB) const override;
+  bool enableAggressiveFMAFusion(EVT VT) const override;
   EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
   MVT getScalarShiftAmountTy(EVT VT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
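
The f32 half of the isFMAFasterThanFMulAndFAdd comment is not covered by the f64 tests below. A minimal sketch of what it predicts, as a hypothetical companion test (not part of this patch; the check line states the mad selection the comment implies, not verified llc output):

  ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast < %s | FileCheck %s
  ; CHECK: v_mad_f32
  define void @prefer_mad_f32(float addrspace(1)* %out, float %a, float %b, float %c) {
    %mul = fmul float %a, %b
    %madd = fadd float %mul, %c
    store float %madd, float addrspace(1)* %out
    ret void
  }
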
diff --git a/test/CodeGen/R600/fma-combine.ll b/test/CodeGen/R600/fma-combine.ll
new file mode 100644
index 00000000000..9aac90cb953
--- /dev/null
+++ b/test/CodeGen/R600/fma-combine.ll
@@ -0,0 +1,370 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.fma.f64(double, double, double) #0
+declare float @llvm.fma.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fadd double %mul, %c
+  %fma1 = fadd double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %mul, %c
+  %fma1 = fsub double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %c, %mul
+  %fma1 = fsub double %d, %mul
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma = fsub double %mul.neg, %c
+
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul.neg, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %x = load double addrspace(1)* %gep.0
+  %y = load double addrspace(1)* %gep.1
+  %z = load double addrspace(1)* %gep.2
+  %u = load double addrspace(1)* %gep.3
+  %v = load double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+  %tmp2 = fsub double %tmp1, %z
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %x = load double addrspace(1)* %gep.0
+  %y = load double addrspace(1)* %gep.1
+  %z = load double addrspace(1)* %gep.2
+  %u = load double addrspace(1)* %gep.3
+  %v = load double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub double %x, %tmp1
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
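
For reference, the first RUN line above expands (with lit's %s substituted by the test path) to a pipeline that can be invoked by hand from an LLVM build tree; this is ordinary lit/FileCheck usage, assuming llc and FileCheck are on PATH:

  llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast \
    < test/CodeGen/R600/fma-combine.ll \
    | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC \
      test/CodeGen/R600/fma-combine.ll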