R600/SI: Add basic DAG combines for fp_class

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225306 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2015-01-06 23:00:39 +00:00
parent b6520ab625
commit a5b2b64292
3 changed files with 212 additions and 1 deletions

View File

@ -218,7 +218,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
// All memory operations. Some folding on the pointer operand is done to help
@ -1302,6 +1302,49 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
/// Try to merge an OR of two FP_CLASS tests of the same value into one test.
///
///   or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
///
/// \returns the merged FP_CLASS node, or an empty SDValue when the operands
/// are not both FP_CLASS nodes, test different values, or have a
/// non-constant mask.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  const bool BothClassTests = Op0.getOpcode() == AMDGPUISD::FP_CLASS &&
                              Op1.getOpcode() == AMDGPUISD::FP_CLASS;
  if (!BothClassTests)
    return SDValue();

  // Both tests have to look at the very same value.
  SDValue Tested = Op0.getOperand(0);
  if (Tested != Op1.getOperand(0))
    return SDValue();

  // The masks can only be merged when both are constants.
  const ConstantSDNode *Mask0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  const ConstantSDNode *Mask1 = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
  if (!Mask0 || !Mask1)
    return SDValue();

  // Only 10 bits are used.
  static const uint32_t ClassBits = 0x3ff;
  const uint32_t Merged =
      (Mask0->getZExtValue() | Mask1->getZExtValue()) & ClassBits;

  SelectionDAG &DAG = DCI.DAG;
  return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, Tested,
                     DAG.getConstant(Merged, MVT::i32));
}
/// Simplify an FP_CLASS node whose mask operand is a constant zero.
///
///   fp_class x, 0 -> false
///
/// \returns an i1 constant 0 in that case, and an empty SDValue otherwise.
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  const ConstantSDNode *ConstMask =
      dyn_cast<ConstantSDNode>(N->getOperand(1));

  // Nothing to fold unless the mask is a known zero.
  if (!ConstMask || !ConstMask->isNullValue())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  return DAG.getConstant(0, MVT::i1);
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@ -1531,6 +1574,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
case ISD::OR:
return performOrCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

View File

@ -58,6 +58,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
// fp_class x, 0 -> false
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;

View File

@ -331,5 +331,167 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
ret void
}
; Two class tests of the same loaded value %a with masks 1 and 3 are ORed
; together. The checks expect exactly one v_cmp_class with the merged mask 3.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_fold_or_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; A chain of two ORs over three class tests of the same value %a (masks 1, 2
; and 4). The checks expect exactly one v_cmp_class with the merged mask 7.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
%class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
%sext = sext i1 %or.1 to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; Ten class tests of the same value %a, one per single-bit mask from 1 to 512,
; are ORed together in a chain. The checks expect the merged mask 0x3ff to be
; moved into a VGPR and used by exactly one e32 v_cmp_class that writes vcc.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
%class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
%class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
%class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
%class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
%class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
%class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
%class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
%class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
%or.2 = or i1 %or.1, %class3
%or.3 = or i1 %or.2, %class4
%or.4 = or i1 %or.3, %class5
%or.5 = or i1 %or.4, %class6
%or.6 = or i1 %or.5, %class7
%or.7 = or i1 %or.6, %class8
%or.8 = or i1 %or.7, %class9
%sext = sext i1 %or.8 to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; Two class tests of the same value %a with disjoint masks 4 and 8 are ORed
; together. The checks expect exactly one v_cmp_class with the merged mask 12.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_fold_or_class_f32_1:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; Two class tests of the same value %a that both use mask 7 are ORed together.
; The checks expect exactly one v_cmp_class with mask 7.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_fold_or_class_f32_2:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; Negative case: the two class tests look at different values (the loaded %a
; and the argument %b), so they must not be merged. The checks expect both
; compares (masks 4 and 8) to remain, followed by an s_or_b64.
; NOTE(review): %gep.out is computed but unused; the store goes to %out.
; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.in = getelementptr float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
%a = load float addrspace(1)* %gep.in
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
%class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; An f32 class test with mask 0. The checks expect no v_cmp_class at all: a
; constant 0 is moved into a VGPR and stored by the next instruction.
; SI-LABEL: {{^}}test_class_0_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
; An f64 class test with mask 0. The checks expect no v_cmp_class at all: a
; constant 0 is moved into a VGPR and then stored.
; SI-LABEL: {{^}}test_class_0_f64:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }