R600/SI: Allow commuting some 3 op instructions

e.g. v_mad_f32 a, b, c -> v_mad_f32 b, a, c

This simplifies matching v_madmk_f32.

This looks somewhat surprising, but it appears to be
OK to do this. We can commute src0 and src1 in all
of these instructions, and that's all that appears
to matter.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221910 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2014-11-13 19:26:47 +00:00
parent ab9c404bbd
commit 1aae959de7
6 changed files with 206 additions and 24 deletions

View File

@ -1374,12 +1374,12 @@ defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32",
>; >;
} // End isCommutable = 1 } // End isCommutable = 1
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32", defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
VOP_F32_F32_F32 VOP_F32_F32_F32
>; >;
let isCommutable = 1 in {
defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32", defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
VOP_F32_F32_F32, int_AMDGPU_mul VOP_F32_F32_F32, int_AMDGPU_mul
>; >;
@ -1388,7 +1388,6 @@ defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32",
VOP_F32_F32_F32, fmul VOP_F32_F32_F32, fmul
>; >;
defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24", defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
VOP_I32_I32_I32, AMDGPUmul_i24 VOP_I32_I32_I32, AMDGPUmul_i24
>; >;
@ -1449,11 +1448,21 @@ defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32",
defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
VOP_I32_I32_I32, AMDGPUbfm>; VOP_I32_I32_I32, AMDGPUbfm>;
let isCommutable = 1 in {
defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>; defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
} // End isCommutable = 1
defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>; defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
let isCommutable = 1 in {
defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>; defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
} // End isCommutable = 1
defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>; defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32", defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
VOP_I32_I32_I32 VOP_I32_I32_I32
>; >;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32", defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
@ -1503,18 +1512,22 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
// VOP3 Instructions // VOP3 Instructions
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
let isCommutable = 1 in {
defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32", defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
VOP_F32_F32_F32_F32 VOP_F32_F32_F32_F32
>; >;
defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32", defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
VOP_F32_F32_F32_F32, fmad VOP_F32_F32_F32_F32, fmad
>; >;
defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24", defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
VOP_I32_I32_I32_I32, AMDGPUmad_i24 VOP_I32_I32_I32_I32, AMDGPUmad_i24
>; >;
defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24", defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
VOP_I32_I32_I32_I32, AMDGPUmad_u24 VOP_I32_I32_I32_I32, AMDGPUmad_u24
>; >;
} // End isCommutable = 1
defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32", defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
VOP_F32_F32_F32_F32 VOP_F32_F32_F32_F32
@ -1537,12 +1550,16 @@ defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32",
defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32", defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
VOP_I32_I32_I32_I32, AMDGPUbfi VOP_I32_I32_I32_I32, AMDGPUbfi
>; >;
let isCommutable = 1 in {
defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32", defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
VOP_F32_F32_F32_F32, fma VOP_F32_F32_F32_F32, fma
>; >;
defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64", defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
VOP_F64_F64_F64_F64, fma VOP_F64_F64_F64_F64, fma
>; >;
} // End isCommutable = 1
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32", defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
VOP_I32_I32_I32_I32 VOP_I32_I32_I32_I32
@ -1629,15 +1646,19 @@ defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
// Double precision division pre-scale. // Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>; defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
let isCommutable = 1 in {
defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32", defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>; >;
defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64", defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>; >;
} // End isCommutable = 1
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
defm V_TRIG_PREOP_F64 : VOP3Inst < defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>; >;
@ -2840,6 +2861,8 @@ defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
VOP_I32_I32_I32 VOP_I32_I32_I32
>; >;
let isCommutable = 1 in {
defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
VOP_I64_I32_I32_I64 VOP_I64_I32_I32_I64
>; >;
@ -2848,6 +2871,7 @@ defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
VOP_I64_I32_I32_I64 VOP_I64_I32_I32_I64
>; >;
} // End isCommutable = 1
// Remaining instructions: // Remaining instructions:
// FLAT_* // FLAT_*

View File

@ -46,5 +46,111 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
ret void ret void
} }
; FIXME: Should use SGPR for literal.
; FUNC-LABEL: @commute_add_lit_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%x = load float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%z = fadd float 1024.0, %x.fabs
store float %z, float addrspace(1)* %out
ret void
}
; FUNC-LABEL: @commute_add_fabs_f32
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
%x = load float addrspace(1)* %gep.0
%y = load float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fadd float %x, %y.fabs
store float %z, float addrspace(1)* %out
ret void
}
; FUNC-LABEL: @commute_mul_fneg_f32
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
%x = load float addrspace(1)* %gep.0
%y = load float addrspace(1)* %gep.1
%y.fneg = fsub float -0.000000e+00, %y
%z = fmul float %x, %y.fneg
store float %z, float addrspace(1)* %out
ret void
}
; FUNC-LABEL: @commute_mul_fabs_fneg_f32
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
%x = load float addrspace(1)* %gep.0
%y = load float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
%z = fmul float %x, %y.fabs.fneg
store float %z, float addrspace(1)* %out
ret void
}
; There's no reason to commute this.
; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
%x = load float addrspace(1)* %gep.0
%y = load float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fmul float %x.fabs, %y.fabs
store float %z, float addrspace(1)* %out
ret void
}
; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
%x = load float addrspace(1)* %gep.0
%y = load float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
%z = fmul float %x.fabs, %y.fabs.fneg
store float %z, float addrspace(1)* %out
ret void
}
attributes #0 = { nounwind } attributes #0 = { nounwind }
attributes #1 = { nounwind readnone } attributes #1 = { nounwind readnone }

View File

@ -5,6 +5,8 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}fma_f32: ; FUNC-LABEL: {{^}}fma_f32:
; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
@ -12,12 +14,12 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounw
; EG: FMA {{\*? *}}[[RES]] ; EG: FMA {{\*? *}}[[RES]]
define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) { float addrspace(1)* %in2, float addrspace(1)* %in3) {
%r0 = load float addrspace(1)* %in1 %r0 = load float addrspace(1)* %in1
%r1 = load float addrspace(1)* %in2 %r1 = load float addrspace(1)* %in2
%r2 = load float addrspace(1)* %in3 %r2 = load float addrspace(1)* %in3
%r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
store float %r3, float addrspace(1)* %out store float %r3, float addrspace(1)* %out
ret void ret void
} }
; FUNC-LABEL: {{^}}fma_v2f32: ; FUNC-LABEL: {{^}}fma_v2f32:
@ -29,12 +31,12 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
%r0 = load <2 x float> addrspace(1)* %in1 %r0 = load <2 x float> addrspace(1)* %in1
%r1 = load <2 x float> addrspace(1)* %in2 %r1 = load <2 x float> addrspace(1)* %in2
%r2 = load <2 x float> addrspace(1)* %in3 %r2 = load <2 x float> addrspace(1)* %in3
%r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
store <2 x float> %r3, <2 x float> addrspace(1)* %out store <2 x float> %r3, <2 x float> addrspace(1)* %out
ret void ret void
} }
; FUNC-LABEL: {{^}}fma_v4f32: ; FUNC-LABEL: {{^}}fma_v4f32:
@ -50,10 +52,41 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
; EG-DAG: FMA {{\*? *}}[[RES]].W ; EG-DAG: FMA {{\*? *}}[[RES]].W
define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
<4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
%r0 = load <4 x float> addrspace(1)* %in1 %r0 = load <4 x float> addrspace(1)* %in1
%r1 = load <4 x float> addrspace(1)* %in2 %r1 = load <4 x float> addrspace(1)* %in2
%r2 = load <4 x float> addrspace(1)* %in3 %r2 = load <4 x float> addrspace(1)* %in3
%r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
store <4 x float> %r3, <4 x float> addrspace(1)* %out store <4 x float> %r3, <4 x float> addrspace(1)* %out
ret void ret void
}
; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %in.a.gep, align 4
%b = load float addrspace(1)* %in.b.gep, align 4
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
store float %fma, float addrspace(1)* %out.gep, align 4
ret void
}
; FUNC-LABEL: @fma_commute_mul_s_f32
define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %in.a.gep, align 4
%c = load float addrspace(1)* %in.b.gep, align 4
%fma = call float @llvm.fma.f32(float %a, float %b, float %c)
store float %fma, float addrspace(1)* %out.gep, align 4
ret void
} }

View File

@ -116,7 +116,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4 ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
; CHECK: buffer_store_dword [[RESULT]] ; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@ -158,7 +158,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4 ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
; CHECK: buffer_store_dword [[RESULT]] ; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone

View File

@ -5,6 +5,7 @@
; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}test_umad24: ; FUNC-LABEL: {{^}}test_umad24:
; SI: v_mad_u32_u24 ; SI: v_mad_u32_u24
@ -17,3 +18,21 @@ define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2
ret void ret void
} }
; FUNC-LABEL: {{^}}commute_umad24:
; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
; SI: buffer_store_dword [[RESULT]]
define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
%src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
%src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
%src0 = load i32 addrspace(1)* %src0.gep, align 4
%src2 = load i32 addrspace(1)* %src2.gep, align 4
%mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
store i32 %mad, i32 addrspace(1)* %out.gep, align 4
ret void
}

View File

@ -73,7 +73,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a: ; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
; SI: s_load_dword [[SGPR:s[0-9]+]] ; SI: s_load_dword [[SGPR:s[0-9]+]]
; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]] ; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
; SI: buffer_store_dword [[RESULT]] ; SI: buffer_store_dword [[RESULT]]
define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1