R600/SI: Add basic DAG combines for fp_class

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225306 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-05 01:31:05 +00:00 · 2015-01-06 23:00:39 +00:00 · 2015-01-06 23:00:39 +00:00 · a5b2b64292
commit a5b2b64292
parent b6520ab625
3 changed files with 212 additions and 1 deletions
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@ -218,7 +218,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::SETCC);
-
+  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  // All memory operations. Some folding on the pointer operand is done to help
@ -1302,6 +1302,49 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
 }

+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  // Merge two class tests of the same value into one test of the combined mask:
+  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+    SDValue Src = LHS.getOperand(0);
+    if (Src != RHS.getOperand(0))
+      return SDValue(); // The two tests examine different values.
+
+    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if (!CLHS || !CRHS)
+      return SDValue(); // Both masks must be constants to be merged.
+
+    // Only the low 10 bits of the mask are used, so discard the rest.
+    static const uint32_t MaxMask = 0x3ff;
+
+    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+    return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+                       Src, DAG.getConstant(NewMask, MVT::i32));
+  }
+
+  return SDValue(); // No fold applies; hand back an empty value.
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Mask = N->getOperand(1);
+  // A constant mask of zero folds the whole node to the i1 constant 0:
+  // fp_class x, 0 -> false
+  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+    if (CMask->isNullValue())
+      return DAG.getConstant(0, MVT::i1);
+  }
+
+  return SDValue(); // No fold applies; hand back an empty value.
+}
+
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
@ -1531,6 +1574,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    }
    break;
  }
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@ -58,6 +58,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
  SDValue performSHLPtrCombine(SDNode *N,
                               unsigned AS,
                               DAGCombinerInfo &DCI) const;
+  SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;

--- a/test/CodeGen/R600/llvm.AMDGPU.class.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.class.ll
@ -331,5 +331,167 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
  ret void
 }

+; SI-LABEL: {{^}}test_fold_or_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+  %or = or i1 %class0, %class1 ; same source %a, masks 1 | 3: a single class test with mask 3 is expected
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2 ; chained or of masks 1, 2, 4 on %a: a single class test with mask 7 is expected
+
+  %sext = sext i1 %or.1 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
+  %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
+  %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+  %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
+  %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
+  %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2
+  %or.2 = or i1 %or.1, %class3
+  %or.3 = or i1 %or.2, %class4
+  %or.4 = or i1 %or.3, %class5
+  %or.5 = or i1 %or.4, %class6
+  %or.6 = or i1 %or.5, %class7
+  %or.7 = or i1 %or.6, %class8
+  %or.8 = or i1 %or.7, %class9 ; all ten single-bit masks (1..512) on %a: a single class test with mask 0x3ff is expected
+  %sext = sext i1 %or.8 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_1:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %or = or i1 %class0, %class1 ; same source %a, masks 4 | 8: a single class test with mask 12 is expected
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_2:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %or = or i1 %class0, %class1 ; identical tests (mask 7 on %a) on both sides: a single class test with mask 7 is expected
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
+; SI: s_or_b64
+; SI: s_endpgm
+define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): unused, the store below writes to %out
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+  %or = or i1 %class0, %class1 ; sources differ (%a vs %b): both class tests and the or are expected to remain
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f32:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 ; zero mask: expected to fold to constant 0 with no class test emitted
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f64:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 ; zero mask on an f64 source: expected to fold to constant 0 with no class test emitted
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }