R600/SI: Remove i1 pseudo VALU ops

Select i1 logical ops directly to 64-bit SALU instructions.
Vector i1 values are always really in SGPRs, with each
bit for each item in the wave. This saves about 4 instructions
when and/or/xoring any condition, and also helps write conditions
that need to be passed in vcc.

This should work correctly now that the SGPR live range
fixing pass works. More work is needed to eliminate the VReg_1
pseudo regclass and possibly the entire SILowerI1Copies pass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@223206 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2014-12-03 05:22:35 +00:00
parent bd5f9f45d1
commit ec0a7cd15a
10 changed files with 339 additions and 124 deletions

View File

@ -131,6 +131,10 @@ def as_i32imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
}]>;
def as_i64imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
}]>;
def IMM8bit : PatLeaf <(imm),
[{return isUInt<8>(N->getZExtValue());}]
>;

View File

@ -1686,30 +1686,8 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
let isCodeGenOnly = 1, isPseudo = 1 in {
def V_MOV_I1 : InstSI <
(outs VReg_1:$dst),
(ins i1imm:$src),
"", [(set i1:$dst, (imm:$src))]
>;
def V_AND_I1 : InstSI <
(outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
[(set i1:$dst, (and i1:$src0, i1:$src1))]
>;
def V_OR_I1 : InstSI <
(outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
[(set i1:$dst, (or i1:$src0, i1:$src1))]
>;
def V_XOR_I1 : InstSI <
(outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
[(set i1:$dst, (xor i1:$src0, i1:$src1))]
>;
let hasSideEffects = 1 in {
def SGPR_USE : InstSI <(outs),(ins), "", []>;
}
@ -2495,6 +2473,14 @@ def : Pat <
(S_MOV_B64 InlineImm<i64>:$imm)
>;
// XXX - Should this use a s_cmp to set SCC?
// Set to sign-extended 64-bit value (true = -1, false = 0)
def : Pat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
>;
/********** ===================== **********/
/********** Interpolation Paterns **********/
/********** ===================== **********/
@ -3045,6 +3031,27 @@ def : Pat <
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisions still write to a pair of SGPRs, so treat these as
// 64-bit comparisons. When legalizing SGPR copies, instructions
// resulting in the copies from SCC to these instructions will be
// moved to the VALU.
def : Pat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
def : Pat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>;
def : Pat <
(i1 (xor i1:$src0, i1:$src1)),
(S_XOR_B64 $src0, $src1)
>;
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
@ -3057,7 +3064,7 @@ def : Pat <
def : Pat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <

View File

@ -85,30 +85,6 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
I1Defs.push_back(MI.getOperand(0).getReg());
MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
continue;
}
if (MI.getOpcode() == AMDGPU::V_AND_I1) {
I1Defs.push_back(MI.getOperand(0).getReg());
MI.setDesc(TII->get(AMDGPU::V_AND_B32_e64));
continue;
}
if (MI.getOpcode() == AMDGPU::V_OR_I1) {
I1Defs.push_back(MI.getOperand(0).getReg());
MI.setDesc(TII->get(AMDGPU::V_OR_B32_e64));
continue;
}
if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
I1Defs.push_back(MI.getOperand(0).getReg());
MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e64));
continue;
}
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
unsigned Reg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
@ -117,32 +93,52 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
if (MI.getOpcode() != AMDGPU::COPY ||
!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
!TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
if (MI.getOpcode() != AMDGPU::COPY)
continue;
const MachineOperand &Dst = MI.getOperand(0);
const MachineOperand &Src = MI.getOperand(1);
const TargetRegisterClass *DstRC =
MRI.getRegClass(MI.getOperand(0).getReg());
const TargetRegisterClass *SrcRC =
MRI.getRegClass(MI.getOperand(1).getReg());
if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
I1Defs.push_back(MI.getOperand(0).getReg());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
.addOperand(MI.getOperand(0))
.addImm(0)
.addImm(-1)
.addOperand(MI.getOperand(1));
I1Defs.push_back(Dst.getReg());
DebugLoc DL = MI.getDebugLoc();
MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
if (DefInst->getOperand(1).isImm()) {
I1Defs.push_back(Dst.getReg());
int64_t Val = DefInst->getOperand(1).getImm();
assert(Val == 0 || Val == -1);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
.addOperand(Dst)
.addImm(Val);
MI.eraseFromParent();
continue;
}
}
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
.addOperand(Dst)
.addImm(0)
.addImm(-1)
.addOperand(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
.addOperand(MI.getOperand(0))
.addOperand(MI.getOperand(1))
.addImm(0);
.addOperand(Dst)
.addOperand(Src)
.addImm(0);
MI.eraseFromParent();
}
}

View File

@ -1,5 +1,5 @@
; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.ceil.f64(double) nounwind readnone
declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
@ -22,12 +22,15 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; SI: cmp_gt_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: cmp_gt_f64
; SI: cndmask_b32
; SI: cmp_ne_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: v_cmp_o_f64
; SI: v_cmp_neq_f64
; SI: s_and_b64
; SI: v_cmp_gt_f64
; SI: s_and_b64
; SI: v_cndmask_b32
; SI: v_cndmask_b32
; SI: v_add_f64
; SI: s_endpgm
define void @fceil_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.ceil.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out

View File

@ -1,5 +1,5 @@
; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.floor.f64(double) nounwind readnone
declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
@ -23,12 +23,15 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; SI: cmp_gt_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: cmp_lt_f64
; SI: cndmask_b32
; SI: cmp_ne_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: v_cmp_o_f64
; SI: v_cmp_neq_f64
; SI: s_and_b64
; SI: v_cmp_lt_f64
; SI: s_and_b64
; SI: v_cndmask_b32
; SI: v_cndmask_b32
; SI: v_add_f64
; SI: s_endpgm
define void @ffloor_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.floor.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out

View File

@ -96,11 +96,12 @@ entry:
; R600-DAG: SETNE_DX10
; R600-DAG: AND_INT
; R600-DAG: SETNE_INT
; SI: v_cmp_o_f32
; SI: v_cmp_neq_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_and_b32_e32
; SI-DAG: v_cmp_o_f32_e32 vcc
; SI-DAG: v_cmp_neq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
; SI: buffer_store_dword [[VRESULT]]
define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp one float %a, %b
@ -130,11 +131,12 @@ entry:
; R600-DAG: SETE_DX10
; R600-DAG: OR_INT
; R600-DAG: SETNE_INT
; SI: v_cmp_u_f32
; SI: v_cmp_eq_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI-DAG: v_cmp_u_f32_e32 vcc
; SI-DAG: v_cmp_eq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; SI: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[OR]]
; SI: buffer_store_dword [[VRESULT]]
define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ueq float %a, %b
@ -148,9 +150,8 @@ entry:
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_gt_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ugt float %a, %b
@ -164,9 +165,8 @@ entry:
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_ge_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp uge float %a, %b
@ -180,9 +180,8 @@ entry:
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_lt_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ult float %a, %b
@ -196,9 +195,8 @@ entry:
; R600: SETE_DX10
; SI: v_cmp_u_f32
; SI: v_cmp_le_f32
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ule float %a, %b

View File

@ -57,11 +57,11 @@ entry:
}
; FUNC-LABEL: {{^}}f64_one:
; SI: v_cmp_o_f64
; SI: v_cmp_neq_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_and_b32_e32
; SI-DAG: v_cmp_o_f64_e32 vcc
; SI-DAG: v_cmp_neq_f64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
; SI: buffer_store_dword [[VRESULT]]
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
@ -83,9 +83,8 @@ entry:
; FUNC-LABEL: {{^}}f64_ueq:
; SI: v_cmp_u_f64
; SI: v_cmp_eq_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
@ -97,9 +96,8 @@ entry:
; FUNC-LABEL: {{^}}f64_ugt:
; SI: v_cmp_u_f64
; SI: v_cmp_gt_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
@ -111,9 +109,8 @@ entry:
; FUNC-LABEL: {{^}}f64_uge:
; SI: v_cmp_u_f64
; SI: v_cmp_ge_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
@ -125,9 +122,8 @@ entry:
; FUNC-LABEL: {{^}}f64_ult:
; SI: v_cmp_u_f64
; SI: v_cmp_lt_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
@ -139,9 +135,8 @@ entry:
; FUNC-LABEL: {{^}}f64_ule:
; SI: v_cmp_u_f64
; SI: v_cmp_le_f64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_or_b32_e32
; SI: s_or_b64
; SI: v_cndmask_b32
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b

View File

@ -59,6 +59,47 @@ endif:
ret void
}
; FIXME: Should write to different SGPR pairs instead of copying to
; VALU for i1 phi.
; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_lt_i32_e64 [[CMP_IF:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
; SI: BB2_1:
; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_i32_e64 [[CMP_ELSE:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
; SI: v_cmp_ne_i32_e64 [[CMP_CMP:s\[[0-9]+:[0-9]+\]]], [[V_CMP]], 0
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
; SI: buffer_store_dword [[RESULT]]
define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.r600.read.tidig.x() #0
%tmp1 = icmp eq i32 %tid, 0
br i1 %tmp1, label %if, label %else
if:
%gep.if = getelementptr i32 addrspace(1)* %a, i32 %tid
%a.val = load i32 addrspace(1)* %gep.if
%cmp.if = icmp eq i32 %a.val, 0
br label %endif
else:
%gep.else = getelementptr i32 addrspace(1)* %b, i32 %tid
%b.val = load i32 addrspace(1)* %gep.else
%cmp.else = icmp slt i32 %b.val, 0
br label %endif
endif:
%tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
%ext = sext i1 %tmp4 to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.r600.read.tidig.x() #0
attributes #0 = { readnone }

View File

@ -1,10 +1,13 @@
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; SI-LABEL: @test_if
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
switch i32 %a, label %default [
i32 0, label %case0
@ -37,3 +40,150 @@ else:
end:
ret void
}
; SI-LABEL: @simple_test_v_if
; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: ; BB#1
; SI: buffer_store_dword
; SI: s_endpgm
; SI: BB1_2:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
br i1 %is.0, label %store, label %exit
store:
%gep = getelementptr i32 addrspace(1)* %dst, i32 %tid
store i32 999, i32 addrspace(1)* %gep
ret void
exit:
ret void
}
; SI-LABEL: @simple_test_v_loop
; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: s_cbranch_execz BB2_2
; SI: ; BB#1:
; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI: BB2_3:
; SI: buffer_load_dword
; SI: buffer_store_dword
; SI: v_cmp_eq_i32_e32 vcc,
; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
; SI: s_cbranch_execnz BB2_3
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
%limit = add i32 %tid, 64
br i1 %is.0, label %loop, label %exit
loop:
%i = phi i32 [%tid, %entry], [%i.inc, %loop]
%gep.src = getelementptr i32 addrspace(1)* %src, i32 %i
%gep.dst = getelementptr i32 addrspace(1)* %dst, i32 %i
%load = load i32 addrspace(1)* %src
store i32 %load, i32 addrspace(1)* %gep.dst
%i.inc = add nsw i32 %i, 1
%cmp = icmp eq i32 %limit, %i.inc
br i1 %cmp, label %exit, label %loop
exit:
ret void
}
; SI-LABEL: @multi_vcond_loop
; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; BB#0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_gt_i32_e64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]]
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG]], [[OUTER_CMP_SREG]]
; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
; SI: s_cbranch_execz BB3_2
; Initialize inner condition to false
; SI: ; BB#1:
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
; Clear exec bits for workitems that load -1s
; SI: BB3_3:
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
; SI: s_cbranch_execz BB3_5
; SI: BB#4:
; SI: buffer_store_dword
; SI: v_cmp_ge_i64_e32 vcc
; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
; SI: BB3_5:
; SI: s_or_b64 exec, exec, [[ORNEG1]]
; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI: s_cbranch_execnz BB3_3
; SI: BB#6
; SI: s_or_b64 exec, exec, [[COND_STATE]]
; SI: BB3_2:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
%tmp = tail call i32 @llvm.r600.read.tidig.x() #0
%tmp4 = sext i32 %tmp to i64
%tmp5 = getelementptr inbounds i32 addrspace(1)* %arg3, i64 %tmp4
%tmp6 = load i32 addrspace(1)* %tmp5, align 4
%tmp7 = icmp sgt i32 %tmp6, 0
%tmp8 = sext i32 %tmp6 to i64
br i1 %tmp7, label %bb10, label %bb26
bb10: ; preds = %bb, %bb20
%tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
%tmp12 = add nsw i64 %tmp11, %tmp4
%tmp13 = getelementptr inbounds i32 addrspace(1)* %arg1, i64 %tmp12
%tmp14 = load i32 addrspace(1)* %tmp13, align 4
%tmp15 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp12
%tmp16 = load i32 addrspace(1)* %tmp15, align 4
%tmp17 = icmp ne i32 %tmp14, -1
%tmp18 = icmp ne i32 %tmp16, -1
%tmp19 = and i1 %tmp17, %tmp18
br i1 %tmp19, label %bb20, label %bb26
bb20: ; preds = %bb10
%tmp21 = add nsw i32 %tmp16, %tmp14
%tmp22 = getelementptr inbounds i32 addrspace(1)* %arg, i64 %tmp12
store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
%tmp23 = add nuw nsw i64 %tmp11, 1
%tmp24 = icmp slt i64 %tmp23, %tmp8
br i1 %tmp24, label %bb10, label %bb26
bb26: ; preds = %bb10, %bb20, %bb
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

View File

@ -39,19 +39,37 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
; FUNC-LABEL: {{^}}xor_i1:
; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
; SI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_cmp_ge_f32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 0.0
; SI-DAG: v_cmp_ge_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 1.0
; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float addrspace(1) * %in0
%b = load float addrspace(1) * %in1
%acmp = fcmp oge float %a, 0.000000e+00
%bcmp = fcmp oge float %b, 0.000000e+00
%bcmp = fcmp oge float %b, 1.000000e+00
%xor = xor i1 %acmp, %bcmp
%result = select i1 %xor, float %a, float %b
store float %result, float addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_xor_i1:
; SI: buffer_load_ubyte [[A:v[0-9]+]]
; SI: buffer_load_ubyte [[B:v[0-9]+]]
; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
%a = load i1 addrspace(1)* %in0
%b = load i1 addrspace(1)* %in1
%xor = xor i1 %a, %b
store i1 %xor, i1 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}vector_xor_i32:
; SI: v_xor_b32_e32
define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {