llvm-6502/test/Transforms/LoopRotate/phi-duplicate.ll

; RUN: opt -S %s -loop-rotate | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0"

; PR5837
; Input for the loop-rotate pass: one counted loop whose header block
; (for.cond) holds the only phi node, %j.0.
;   * %j.0 enters the loop as the constant 1 and is bumped by 1 per trip.
;   * The loop keeps running while %j.0 is (signed) less than 1000.
;   * Each trip computes G[j] + G[j-1] and stores the sum back into G[j].
;   * %N is declared but never referenced by any instruction below.
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
  ; Fall straight into the loop header; there is no guard in the entry block.
  br label %for.cond

; Loop header: merges the initial index (from entry) with the incremented
; index (from the latch at the end of for.body) and tests the exit condition.
for.cond:                                         ; preds = %for.body, %entry
  %j.0 = phi i64 [ 1, %entry ], [ %inc, %for.body ] ; <i64> [#uses=5]
  %cmp = icmp slt i64 %j.0, 1000                  ; <i1> [#uses=1]
  br i1 %cmp, label %for.body, label %for.end

; Loop body and latch: every use of the index in here refers to the header
; phi %j.0, so these uses have to be rewritten when the header is rotated.
for.body:                                         ; preds = %for.cond
  ; Load G[j].
  %arrayidx = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]
  %tmp3 = load double* %arrayidx                  ; <double> [#uses=1]
  ; Load G[j-1].
  %sub = sub i64 %j.0, 1                          ; <i64> [#uses=1]
  %arrayidx6 = getelementptr inbounds double* %G, i64 %sub ; <double*> [#uses=1]
  %tmp7 = load double* %arrayidx6                 ; <double> [#uses=1]
  ; G[j] = G[j] + G[j-1]; the address of G[j] is recomputed rather than reused.
  %add = fadd double %tmp3, %tmp7                 ; <double> [#uses=1]
  %arrayidx10 = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]
  store double %add, double* %arrayidx10
  ; Advance the index and jump back to the header (the loop's only backedge).
  %inc = add nsw i64 %j.0, 1                      ; <i64> [#uses=1]
  br label %for.cond

; Single exit block, reached only from the header's conditional branch.
for.end:                                          ; preds = %for.cond
  ret void
}

; Should only end up with one phi.
; CHECK:      define void @test
; CHECK-NEXT: entry:
; CHECK-NEXT:   br label %for.body
; CHECK:      for.body:
; CHECK-NEXT:   %j.01 = phi i64
; CHECK-NOT:  br
; CHECK:   br i1 %cmp, label %for.body, label %for.end
; CHECK:      for.end:
; CHECK-NEXT:        ret void
fix PR5837 by having SSAUpdate reuse phi nodes for the 'GetValueInMiddleOfBlock' case, instead of inserting duplicates. A similar fix is almost certainly needed by the machine-level SSAUpdate implementation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91820 91177308-0d34-0410-b5e6-96231b3b80d8 2009-12-21 07:16:11 +00:00			`; RUN: opt -S %s -loop-rotate \| FileCheck %s`
			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"`
			`target triple = "x86_64-apple-darwin10.0"`

			`; PR5837`
			`define void @test(i32 %N, double* %G) nounwind ssp {`
			`entry:`
			`br label %for.cond`

			`for.cond: ; preds = %for.body, %entry`
			`%j.0 = phi i64 [ 1, %entry ], [ %inc, %for.body ] ; <i64> [#uses=5]`
			`%cmp = icmp slt i64 %j.0, 1000 ; <i1> [#uses=1]`
			`br i1 %cmp, label %for.body, label %for.end`

			`for.body: ; preds = %for.cond`
			`%arrayidx = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]`
			`%tmp3 = load double* %arrayidx ; <double> [#uses=1]`
			`%sub = sub i64 %j.0, 1 ; <i64> [#uses=1]`
			`%arrayidx6 = getelementptr inbounds double* %G, i64 %sub ; <double*> [#uses=1]`
			`%tmp7 = load double* %arrayidx6 ; <double> [#uses=1]`
			`%add = fadd double %tmp3, %tmp7 ; <double> [#uses=1]`
			`%arrayidx10 = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]`
			`store double %add, double* %arrayidx10`
			`%inc = add nsw i64 %j.0, 1 ; <i64> [#uses=1]`
			`br label %for.cond`

			`for.end: ; preds = %for.cond`
			`ret void`
			`}`

When loop rotation happens, it is very common for the duplicated condbr to be foldable into an uncond branch. When this happens, we can make a much simpler CFG for the loop, which is important for nested loop cases where we want the outer loop to be aggressively optimized. Handle this case more aggressively. For example, previously on phi-duplicate.ll we would get this: define void @test(i32 %N, double* %G) nounwind ssp { entry: %cmp1 = icmp slt i64 1, 1000 br i1 %cmp1, label %bb.nph, label %for.end bb.nph: ; preds = %entry br label %for.body for.body: ; preds = %bb.nph, %for.cond %j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ] %arrayidx = getelementptr inbounds double* %G, i64 %j.02 %tmp3 = load double* %arrayidx %sub = sub i64 %j.02, 1 %arrayidx6 = getelementptr inbounds double* %G, i64 %sub %tmp7 = load double* %arrayidx6 %add = fadd double %tmp3, %tmp7 %arrayidx10 = getelementptr inbounds double* %G, i64 %j.02 store double %add, double* %arrayidx10 %inc = add nsw i64 %j.02, 1 br label %for.cond for.cond: ; preds = %for.body %cmp = icmp slt i64 %inc, 1000 br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge for.cond.for.end_crit_edge: ; preds = %for.cond br label %for.end for.end: ; preds = %for.cond.for.end_crit_edge, %entry ret void } Now we get the much nicer: define void @test(i32 %N, double* %G) nounwind ssp { entry: br label %for.body for.body: ; preds = %entry, %for.body %j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds double* %G, i64 %j.01 %tmp3 = load double* %arrayidx %sub = sub i64 %j.01, 1 %arrayidx6 = getelementptr inbounds double* %G, i64 %sub %tmp7 = load double* %arrayidx6 %add = fadd double %tmp3, %tmp7 %arrayidx10 = getelementptr inbounds double* %G, i64 %j.01 store double %add, double* %arrayidx10 %inc = add nsw i64 %j.01, 1 %cmp = icmp slt i64 %inc, 1000 br i1 %cmp, label %for.body, label %for.end for.end: ; preds = %for.body ret void } With all of these recent changes, we are now able to 
compile: void foo(char *X) { for (int i = 0; i != 100; ++i) for (int j = 0; j != 100; ++j) X[j+i*100] = 0; } into a single memset of 10000 bytes. This series of changes should also be helpful for other nested loop scenarios as well. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123079 91177308-0d34-0410-b5e6-96231b3b80d8 2011-01-08 19:59:06 +00:00			`; Should only end up with one phi.`
When rotating loops, put the original header at the bottom of the loop, making the resulting loop significantly less ugly. Also, zap its trivial PHI nodes, since it's easy. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@111255 91177308-0d34-0410-b5e6-96231b3b80d8 2010-08-17 17:39:21 +00:00			`; CHECK: define void @test`
			`; CHECK-NEXT: entry:`
			`; CHECK-NEXT: br label %for.body`
			`; CHECK: for.body:`
Have loop-rotate simplify instructions (yay instsimplify!) as it clones them into the loop preheader, eliminating silly instructions like "icmp i32 0, 100" in fixed tripcount loops. This also better exposes the bigger problem with loop rotate that I'd like to fix: once this has been folded, the duplicated conditional branch often turns into an uncond branch. Not aggressively handling this is pessimizing later loop optimizations somethin' fierce by making "dominates all exit blocks" checks fail. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123060 91177308-0d34-0410-b5e6-96231b3b80d8 2011-01-08 08:24:46 +00:00			`; CHECK-NEXT: %j.01 = phi i64`
When loop rotation happens, it is very common for the duplicated condbr to be foldable into an uncond branch. When this happens, we can make a much simpler CFG for the loop, which is important for nested loop cases where we want the outer loop to be aggressively optimized. Handle this case more aggressively. For example, previously on phi-duplicate.ll we would get this: define void @test(i32 %N, double* %G) nounwind ssp { entry: %cmp1 = icmp slt i64 1, 1000 br i1 %cmp1, label %bb.nph, label %for.end bb.nph: ; preds = %entry br label %for.body for.body: ; preds = %bb.nph, %for.cond %j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ] %arrayidx = getelementptr inbounds double* %G, i64 %j.02 %tmp3 = load double* %arrayidx %sub = sub i64 %j.02, 1 %arrayidx6 = getelementptr inbounds double* %G, i64 %sub %tmp7 = load double* %arrayidx6 %add = fadd double %tmp3, %tmp7 %arrayidx10 = getelementptr inbounds double* %G, i64 %j.02 store double %add, double* %arrayidx10 %inc = add nsw i64 %j.02, 1 br label %for.cond for.cond: ; preds = %for.body %cmp = icmp slt i64 %inc, 1000 br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge for.cond.for.end_crit_edge: ; preds = %for.cond br label %for.end for.end: ; preds = %for.cond.for.end_crit_edge, %entry ret void } Now we get the much nicer: define void @test(i32 %N, double* %G) nounwind ssp { entry: br label %for.body for.body: ; preds = %entry, %for.body %j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds double* %G, i64 %j.01 %tmp3 = load double* %arrayidx %sub = sub i64 %j.01, 1 %arrayidx6 = getelementptr inbounds double* %G, i64 %sub %tmp7 = load double* %arrayidx6 %add = fadd double %tmp3, %tmp7 %arrayidx10 = getelementptr inbounds double* %G, i64 %j.01 store double %add, double* %arrayidx10 %inc = add nsw i64 %j.01, 1 %cmp = icmp slt i64 %inc, 1000 br i1 %cmp, label %for.body, label %for.end for.end: ; preds = %for.body ret void } With all of these recent changes, we are now able to 
compile: void foo(char *X) { for (int i = 0; i != 100; ++i) for (int j = 0; j != 100; ++j) X[j+i*100] = 0; } into a single memset of 10000 bytes. This series of changes should also be helpful for other nested loop scenarios as well. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123079 91177308-0d34-0410-b5e6-96231b3b80d8 2011-01-08 19:59:06 +00:00			`; CHECK-NOT: br`
			`; CHECK: br i1 %cmp, label %for.body, label %for.end`
			`; CHECK: for.end:`
			`; CHECK-NEXT: ret void`