; llvm-6502/test/Transforms/BBVectorize/loop1.ll

; Data layout and triple pin this test to a 64-bit x86 Linux configuration.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; First invocation: bb-vectorize on its own (required chain depth 3, target
; info ignored), followed by instcombine and gvn. Its output is matched
; against the default-prefix directives below.
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
; Second invocation: the same bb-vectorize pipeline, but preceded by basicaa
; and loop-unroll (threshold 45, partial unrolling allowed). Its output is
; matched against the UNRL-prefixed directives below.
; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
; The second check covers the use of alias analysis (with loop unrolling).

; test1: element-wise loop over three noalias double arrays.
;
; With a = in1[i] and b = in2[i], each iteration stores
;   out[i] = (a*a + a*b) + a*((b+b) + a) + (b*b)*((a+a) + a)
; The induction variable starts at 0, is incremented by 1, and the loop exits
; once the incremented value (truncated to i32) equals 10.
;
; Two sets of expectations follow the loop body:
;   * default prefix - no unrolling; only part of the scalar expression tree
;     is fused into <2 x double> operations.
;   * UNRL prefix    - after unrolling, the loads, arithmetic and the store
;     are all expected in <2 x double> form and the induction step is 2.
define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
entry:
  br label %for.body
; CHECK-LABEL: @test1(
; CHECK-UNRL-LABEL: @test1(

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; %0 = in1[i], %1 = in2[i]
  %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
  %0 = load double* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
  %1 = load double* %arrayidx2, align 8
  ; %add = a*a + a*b
  %mul = fmul double %0, %0
  %mul3 = fmul double %0, %1
  %add = fadd double %mul, %mul3
  ; %mul6 = a * ((b+b) + a), accumulated into %add7
  %add4 = fadd double %1, %1
  %add5 = fadd double %add4, %0
  %mul6 = fmul double %0, %add5
  %add7 = fadd double %add, %mul6
  ; %mul11 = (b*b) * ((a+a) + a), accumulated into %add12
  %mul8 = fmul double %1, %1
  %add9 = fadd double %0, %0
  %add10 = fadd double %add9, %0
  %mul11 = fmul double %mul8, %add10
  %add12 = fadd double %add7, %mul11
  ; out[i] = %add12
  %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
  store double %add12, double* %arrayidx14, align 8
  ; Advance the induction variable; exit when trunc(i + 1) == 10.
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 10
  br i1 %exitcond, label %for.end, label %for.body
; Default-prefix expectations (no unrolling): the loads and the a*a, a*b and
; b*b products stay scalar.
; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
; CHECK: %0 = load double* %arrayidx, align 8
; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
; CHECK: %1 = load double* %arrayidx2, align 8
; CHECK: %mul = fmul double %0, %0
; CHECK: %mul3 = fmul double %0, %1
; CHECK: %add = fadd double %mul, %mul3
; CHECK: %mul8 = fmul double %1, %1
; The two parallel add/add/mul chains are expected as one <2 x double> chain:
; lane 0 carries the (b+b)+a / a*(...) chain, lane 1 the (a+a)+a / (b*b)*(...)
; chain. The lanes are then extracted and summed as scalars.
; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
; CHECK: %add7 = fadd double %add, %mul6.v.r1
; CHECK: %add12 = fadd double %add7, %mul6.v.r2
; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
; CHECK: store double %add12, double* %arrayidx14, align 8
; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
; CHECK: br i1 %exitcond, label %for.end, label %for.body
; UNRL-prefixed expectations (with unrolling): both inputs are loaded as
; <2 x double> through bitcast pointers, the whole expression is computed on
; <2 x double> values, the result is stored as a vector, and the induction
; variable advances by 2 per iteration.
; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
; CHECK-UNRL: %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2
; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-01 03:51:43 +00:00			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"`
			`target triple = "x86_64-unknown-linux-gnu"`
Switch BBVectorize to directly depend on having a TTI analysis. This could be simplified further, but Hal has a specific feature for ignoring TTI, and so I preserved that. Also, I needed to use it because a number of tests fail when switching from a null TTI to the NoTTI nonce implementation. That seems suspicious to me and so may be something that you need to look into, Hal. I worked around it by preserving the old behavior for these tests with the flag that ignores all target info. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171722 91177308-0d34-0410-b5e6-96231b3b80d8 2013-01-07 10:22:36 +00:00			`; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S \| FileCheck %s`
			`; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S \| FileCheck %s -check-prefix=CHECK-UNRL`
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-01 03:51:43 +00:00			`; The second check covers the use of alias analysis (with loop unrolling).`

			`define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {`
			`entry:`
			`br label %for.body`
Update Transforms tests to use CHECK-LABEL for easier debugging. No functionality change. This update was done with the following bash script: find test/Transforms -name ".ll" \| \ while read NAME; do echo "$NAME" if ! grep -q "^; RUN: llc" $NAME; then TEMP=`mktemp -t temp` cp $NAME $TEMP sed -n "s/^define [^@]@\([A-Za-z0-9_]\)(.$/\1/p" < $NAME \| \ while read FUNC; do sed -i '' "s/;\(.\)\([A-Za-z0-9_]\):\( \)@$FUNC\([( ]\)\$/;\1\2-LABEL:\3@$FUNC(/g" $TEMP done mv $TEMP $NAME fi done git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186268 91177308-0d34-0410-b5e6-96231b3b80d8 2013-07-14 01:42:54 +00:00			`; CHECK-LABEL: @test1(`
			`; CHECK-UNRL-LABEL: @test1(`
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-01 03:51:43 +00:00
			`for.body: ; preds = %for.body, %entry`
			`%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]`
			`%arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv`
			`%0 = load double* %arrayidx, align 8`
			`%arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv`
			`%1 = load double* %arrayidx2, align 8`
			`%mul = fmul double %0, %0`
			`%mul3 = fmul double %0, %1`
			`%add = fadd double %mul, %mul3`
			`%add4 = fadd double %1, %1`
			`%add5 = fadd double %add4, %0`
			`%mul6 = fmul double %0, %add5`
			`%add7 = fadd double %add, %mul6`
			`%mul8 = fmul double %1, %1`
			`%add9 = fadd double %0, %0`
			`%add10 = fadd double %add9, %0`
			`%mul11 = fmul double %mul8, %add10`
			`%add12 = fadd double %add7, %mul11`
			`%arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv`
			`store double %add12, double* %arrayidx14, align 8`
			`%indvars.iv.next = add i64 %indvars.iv, 1`
			`%lftr.wideiv = trunc i64 %indvars.iv.next to i32`
			`%exitcond = icmp eq i32 %lftr.wideiv, 10`
			`br i1 %exitcond, label %for.end, label %for.body`
			`; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]`
			`; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv`
			`; CHECK: %0 = load double* %arrayidx, align 8`
			`; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv`
			`; CHECK: %1 = load double* %arrayidx2, align 8`
			`; CHECK: %mul = fmul double %0, %0`
			`; CHECK: %mul3 = fmul double %0, %1`
			`; CHECK: %add = fadd double %mul, %mul3`
			`; CHECK: %mul8 = fmul double %1, %1`
BBVectorize: Choose pair ordering to minimize shuffles BBVectorize would, except for loads and stores, always fuse instructions so that the first instruction (in the current source order) would always represent the low part of the input vectors and the second instruction would always represent the high part. This led to too many shuffles being produced because sometimes the opposite order produces fewer of them. With this change, BBVectorize tracks the kind of pair connections that form the DAG of candidate pairs, and uses that information to reorder the pairs to avoid excess shuffles. Using this information, a future commit will be able to add VTTI-based shuffle costs to the pair selection procedure. Importantly, the number of remaining shuffles can now be estimated during pair selection. There are some trivial instruction reorderings in the test cases, and one simple additional test where we certainly want to do a reordering to avoid an unnecessary shuffle. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167122 91177308-0d34-0410-b5e6-96231b3b80d8 2012-10-31 15:17:07 +00:00			`; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0`
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-01 03:51:43 +00:00			`; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1`
			`; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2`
			`; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0`
			`; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1`
			`; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2`
			`; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1`
			`; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5`
			`; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0`
			`; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1`
			`; CHECK: %add7 = fadd double %add, %mul6.v.r1`
			`; CHECK: %add12 = fadd double %add7, %mul6.v.r2`
			`; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv`
			`; CHECK: store double %add12, double* %arrayidx14, align 8`
			`; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1`
			`; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32`
			`; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10`
			`; CHECK: br i1 %exitcond, label %for.end, label %for.body`
			`; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]`
			`; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv`
			`; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*`
			`; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv`
			`; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*`
			`; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv`
			`; CHECK-UNRL: %2 = load <2 x double>* %0, align 8`
			`; CHECK-UNRL: %3 = load <2 x double>* %1, align 8`
			`; CHECK-UNRL: %mul = fmul <2 x double> %2, %2`
			`; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3`
			`; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3`
			`; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3`
			`; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2`
			`; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5`
			`; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6`
			`; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3`
			`; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2`
			`; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2`
			`; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10`
			`; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11`
			`; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*`
			`; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8`
This patch teaches IndVarSimplify to add nuw and nsw to certain kinds of operations that provably don't overflow. For example, we can prove %civ.inc below does not sign-overflow. With this change, IndVarSimplify changes %civ.inc to an add nsw. define i32 @foo(i32* %array, i32* %length_ptr, i32 %init) { entry: %length = load i32* %length_ptr, !range !0 %len.sub.1 = sub i32 %length, 1 %upper = icmp slt i32 %init, %len.sub.1 br i1 %upper, label %loop, label %exit loop: %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ] %civ.inc = add i32 %civ, 1 %cmp = icmp slt i32 %civ.inc, %length br i1 %cmp, label %latch, label %break latch: store i32 0, i32* %array %check = icmp slt i32 %civ.inc, %len.sub.1 br i1 %check, label %loop, label %break break: ret i32 %civ.inc exit: ret i32 42 } Differential Revision: http://reviews.llvm.org/D6748 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225282 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-06 19:02:56 +00:00			`; CHECK-UNRL: %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2`
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 2012-02-01 03:51:43 +00:00			`; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32`
			`; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10`
			`; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body`

			`for.end: ; preds = %for.body`
			`ret void`
			`}`