2012-12-30 01:28:40 +00:00
|
|
|
; RUN: opt -S -loop-rotate < %s | FileCheck %s
|
2009-12-21 07:16:11 +00:00
|
|
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
|
|
|
target triple = "x86_64-apple-darwin10.0"
|
|
|
|
|
|
|
|
; PR5837
|
|
|
|
define void @test(i32 %N, double* %G) nounwind ssp {
|
|
|
|
entry:
|
|
|
|
br label %for.cond
|
|
|
|
|
|
|
|
for.cond: ; preds = %for.body, %entry
|
|
|
|
%j.0 = phi i64 [ 1, %entry ], [ %inc, %for.body ] ; <i64> [#uses=5]
|
|
|
|
%cmp = icmp slt i64 %j.0, 1000 ; <i1> [#uses=1]
|
|
|
|
br i1 %cmp, label %for.body, label %for.end
|
|
|
|
|
|
|
|
for.body: ; preds = %for.cond
|
|
|
|
%arrayidx = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]
|
|
|
|
%tmp3 = load double* %arrayidx ; <double> [#uses=1]
|
|
|
|
%sub = sub i64 %j.0, 1 ; <i64> [#uses=1]
|
|
|
|
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub ; <double*> [#uses=1]
|
|
|
|
%tmp7 = load double* %arrayidx6 ; <double> [#uses=1]
|
|
|
|
%add = fadd double %tmp3, %tmp7 ; <double> [#uses=1]
|
|
|
|
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.0 ; <double*> [#uses=1]
|
|
|
|
store double %add, double* %arrayidx10
|
|
|
|
%inc = add nsw i64 %j.0, 1 ; <i64> [#uses=1]
|
|
|
|
br label %for.cond
|
|
|
|
|
|
|
|
for.end: ; preds = %for.cond
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123079 91177308-0d34-0410-b5e6-96231b3b80d8
2011-01-08 19:59:06 +00:00
|
|
|
; Should only end up with one phi.
|
2010-08-17 17:39:21 +00:00
|
|
|
; CHECK: define void @test
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: br label %for.body
|
|
|
|
; CHECK: for.body:
|
2011-01-08 08:24:46 +00:00
|
|
|
; CHECK-NEXT: %j.01 = phi i64
|
When loop rotation happens, it is *very* common for the duplicated condbr
to be foldable into an uncond branch. When this happens, we can make a
much simpler CFG for the loop, which is important for nested loop cases
where we want the outer loop to be aggressively optimized.
Handle this case more aggressively. For example, previously on
phi-duplicate.ll we would get this:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
%cmp1 = icmp slt i64 1, 1000
br i1 %cmp1, label %bb.nph, label %for.end
bb.nph: ; preds = %entry
br label %for.body
for.body: ; preds = %bb.nph, %for.cond
%j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.02
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.02, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.02, 1
br label %for.cond
for.cond: ; preds = %for.body
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.cond
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
Now we get the much nicer:
define void @test(i32 %N, double* %G) nounwind ssp {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds double* %G, i64 %j.01
%tmp3 = load double* %arrayidx
%sub = sub i64 %j.01, 1
%arrayidx6 = getelementptr inbounds double* %G, i64 %sub
%tmp7 = load double* %arrayidx6
%add = fadd double %tmp3, %tmp7
%arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
store double %add, double* %arrayidx10
%inc = add nsw i64 %j.01, 1
%cmp = icmp slt i64 %inc, 1000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
With all of these recent changes, we are now able to compile:
void foo(char *X) {
for (int i = 0; i != 100; ++i)
for (int j = 0; j != 100; ++j)
X[j+i*100] = 0;
}
into a single memset of 10000 bytes. This series of changes
should also be helpful for other nested loop scenarios as well.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123079 91177308-0d34-0410-b5e6-96231b3b80d8
2011-01-08 19:59:06 +00:00
|
|
|
; CHECK-NOT: br
|
|
|
|
; CHECK: br i1 %cmp, label %for.body, label %for.end
|
|
|
|
; CHECK: for.end:
|
|
|
|
; CHECK-NEXT: ret void
|