R600: Don't emit empty then clause and use alu_pop_after

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186725 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-14 11:32:34 +00:00 · 2013-07-19 21:45:15 +00:00 · 2013-07-19 21:45:15 +00:00 · 272458bd06
commit 272458bd06
parent 12140450fa
6 changed files with 175 additions and 17 deletions
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@ -1039,8 +1039,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  } else if (FalseMBB->succ_size() == 1
             && *FalseMBB->succ_begin() == TrueMBB) {
    // Triangle pattern, true is empty
-    LandBlk = TrueMBB;
-    TrueMBB = NULL;
+    // We reverse the predicate to make a triangle, empty false pattern;
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end());
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
  } else if (FalseMBB->succ_size() == 1
             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
    LandBlk = *FalseMBB->succ_begin();
@ -1456,6 +1459,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
 void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
  DEBUG(
    dbgs() << "ifPattern BB" << MBB->getNumber();
    dbgs() << "{  ";
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@ -347,6 +347,9 @@ public:
        MaxStack = 1;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+      
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
@ -357,6 +360,10 @@ public:
        }

        MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = 0;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = MI;
        I++;
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
@ -403,6 +410,7 @@ public:
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(0);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
@ -420,7 +428,7 @@ public:
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
-              .addImm(1);
+              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
@ -429,17 +437,24 @@ public:
        }
        case AMDGPU::ENDIF: {
          CurrentStack--;
-          MachineInstr *IfOrElseInst = IfThenElseStack.back();
-          IfThenElseStack.pop_back();
-          CounterPropagateAddr(IfOrElseInst, CfCount + 1);
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
-          MI->eraseFromParent();
            CfCount++;
+          }
+          
+          MachineInstr *IfOrElseInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
+          MI->eraseFromParent();
          break;
        }
        case AMDGPU::PREDICATED_BREAK: {
@ -484,6 +499,21 @@ public:
          break;
        }
      }
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
      MFI->StackSize = getHWStackSize(MaxStack, HasPush);
    }

--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@ -624,6 +624,7 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {

 def CF_ALU : ALU_CLAUSE<8, "ALU">;
 def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;

 def FETCH_CLAUSE : AMDGPUInst <(outs),
 (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s

-; CHECK: JUMP @7
+; CHECK: JUMP @5
 ; CHECK: EXPORT
 ; CHECK-NOT: EXPORT

--- a/test/CodeGen/R600/loop-address.ll
+++ b/test/CodeGen/R600/loop-address.ll
@ -2,12 +2,11 @@

 ;CHECK: TEX
 ;CHECK: ALU_PUSH
-;CHECK: JUMP @4
-;CHECK: ELSE @16
+;CHECK: JUMP @15
 ;CHECK: TEX
-;CHECK: LOOP_START_DX10 @15
-;CHECK: LOOP_BREAK @14
-;CHECK: POP @16
+;CHECK: LOOP_START_DX10 @14
+;CHECK: LOOP_BREAK @13
+;CHECK: POP @15

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
 target triple = "r600--"
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll
@ -0,0 +1,124 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood
+;REQUIRES: asserts
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = bitcast float %0 to i32
+  %5 = icmp eq i32 %4, 0
+  %6 = sext i1 %5 to i32
+  %7 = bitcast i32 %6 to float
+  %8 = bitcast float %7 to i32
+  %9 = icmp ne i32 %8, 0
+  %. = select i1 %9, float 0x36A0000000000000, float %0
+  br label %LOOP
+
+LOOP:                                             ; preds = %LOOP47, %main_body
+  %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ]
+  %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ]
+  %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ]
+  %10 = bitcast float %temp4.1 to i32
+  %11 = icmp eq i32 %10, 1
+  %12 = sext i1 %11 to i32
+  %13 = bitcast i32 %12 to float
+  %14 = bitcast float %13 to i32
+  %15 = icmp ne i32 %14, 0
+  br i1 %15, label %IF41, label %ENDIF40
+
+IF41:                                             ; preds = %LOOP
+  %16 = insertelement <4 x float> undef, float %0, i32 0
+  %17 = insertelement <4 x float> %16, float %temp8.0, i32 1
+  %18 = insertelement <4 x float> %17, float %temp12.0, i32 2
+  %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
+  %20 = insertelement <4 x float> undef, float %0, i32 0
+  %21 = insertelement <4 x float> %20, float %temp8.0, i32 1
+  %22 = insertelement <4 x float> %21, float %temp12.0, i32 2
+  %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
+  %24 = insertelement <4 x float> undef, float %0, i32 0
+  %25 = insertelement <4 x float> %24, float %temp8.0, i32 1
+  %26 = insertelement <4 x float> %25, float %temp12.0, i32 2
+  %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
+  %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1
+  %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2
+  %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1)
+  %32 = insertelement <4 x float> undef, float %0, i32 0
+  %33 = insertelement <4 x float> %32, float %temp8.0, i32 1
+  %34 = insertelement <4 x float> %33, float %temp12.0, i32 2
+  %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2)
+  ret void
+
+ENDIF40:                                          ; preds = %LOOP
+  %36 = bitcast float %temp8.0 to i32
+  %37 = add i32 %36, 1
+  %38 = bitcast i32 %37 to float
+  %39 = bitcast float %temp4.1 to i32
+  %40 = urem i32 %39, 2
+  %41 = bitcast i32 %40 to float
+  %42 = bitcast float %41 to i32
+  %43 = icmp eq i32 %42, 0
+  %44 = sext i1 %43 to i32
+  %45 = bitcast i32 %44 to float
+  %46 = bitcast float %45 to i32
+  %47 = icmp ne i32 %46, 0
+  %48 = bitcast float %temp4.1 to i32
+  br i1 %47, label %IF44, label %ELSE45
+
+IF44:                                             ; preds = %ENDIF40
+  %49 = udiv i32 %48, 2
+  br label %ENDIF43
+
+ELSE45:                                           ; preds = %ENDIF40
+  %50 = mul i32 3, %48
+  %51 = add i32 %50, 1
+  br label %ENDIF43
+
+ENDIF43:                                          ; preds = %ELSE45, %IF44
+  %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ]
+  %52 = bitcast i32 %.sink to float
+  %53 = load <4 x float> addrspace(8)* null
+  %54 = extractelement <4 x float> %53, i32 0
+  %55 = bitcast float %54 to i32
+  br label %LOOP47
+
+LOOP47:                                           ; preds = %ENDIF48, %ENDIF43
+  %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ]
+  %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ]
+  %56 = bitcast float %temp28.0 to i32
+  %57 = icmp uge i32 %56, %55
+  %58 = sext i1 %57 to i32
+  %59 = bitcast i32 %58 to float
+  %60 = bitcast float %59 to i32
+  %61 = icmp ne i32 %60, 0
+  br i1 %61, label %LOOP, label %ENDIF48
+
+ENDIF48:                                          ; preds = %LOOP47
+  %62 = bitcast float %temp12.1 to i32
+  %63 = mul i32 %62, 2
+  %64 = bitcast i32 %63 to float
+  %65 = bitcast float %64 to i32
+  %66 = urem i32 %65, 2147483647
+  %67 = bitcast i32 %66 to float
+  %68 = bitcast float %temp28.0 to i32
+  %69 = add i32 %68, 1
+  %70 = bitcast i32 %69 to float
+  br label %LOOP47
+}
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+
+declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }