From 55306bdea5d2d53be39f3ac59fadf5220ee6b5d0 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 21 Dec 2012 00:07:35 +0000
Subject: [PATCH] Fix a bug in the code that checks if we can vectorize loops
 while using dynamic memory bound checks.  Before the fix we were able to
 vectorize this loop from the Livermore Loops benchmark:

for ( k=1 ; k<n ; k++ )
  x[k] = x[k-1] + y[k];



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170811 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    |  43 ++++---
 .../LoopVectorize/same-base-access.ll         | 110 ++++++++++++++++++
 test/Transforms/LoopVectorize/scalar-store.ll |  48 --------
 3 files changed, 134 insertions(+), 67 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/same-base-access.ll
 delete mode 100644 test/Transforms/LoopVectorize/scalar-store.ll
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 827c13ffc9f..4a90d78c246 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1593,8 +1593,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   ValueVector::iterator I, IE;
   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
-    StoreInst *ST = dyn_cast<StoreInst>(*I);
-    assert(ST && "Bad StoreInst");
+    StoreInst *ST = cast<StoreInst>(*I);
     Value* Ptr = ST->getPointerOperand();
 
     if (isUniform(Ptr)) {
@@ -1609,8 +1608,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
-    LoadInst *LD = dyn_cast<LoadInst>(*I);
-    assert(LD && "Bad LoadInst");
+    LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
     // If we did *not* see this pointer before, insert it to the
     // read list. If we *did* see it before, then it is already in
@@ -1633,13 +1631,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  bool RT = true;
+  bool CanDoRT = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
       PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
-      RT = false;
+      CanDoRT = false;
       break;
     }
   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
@@ -1647,23 +1645,23 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
-      RT = false;
+      CanDoRT = false;
       break;
     }
 
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
-  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
-    RT = false;
+    CanDoRT = false;
   }
 
-  PtrRtCheck.Need = RT;
-
-  if (RT) {
+  if (CanDoRT) {
     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
   }
 
+  bool NeedRTCheck = false;
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
@@ -1678,12 +1676,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return RT;
+        NeedRTCheck = true;
       }
       if (!WriteObjects.insert(*it)) {
         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
               << **it <<"\n");
-        return RT;
+        return false;
       }
     }
     TempObjects.clear();
@@ -1696,20 +1694,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return RT;
+        NeedRTCheck = true;
       }
       if (WriteObjects.count(*it)) {
         DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
               << **it <<"\n");
-        return RT;
+        return false;
       }
     }
     TempObjects.clear();
   }
 
-  // It is safe to vectorize and we don't need any runtime checks.
-  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
-  PtrRtCheck.reset();
+  PtrRtCheck.Need = NeedRTCheck;
+  if (NeedRTCheck && !CanDoRT) {
+    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+        " need a runtime memory check.\n");
   return true;
 }
 
diff --git a/test/Transforms/LoopVectorize/same-base-access.ll b/test/Transforms/LoopVectorize/same-base-access.ll
new file mode 100644
index 00000000000..f9ef32e4922
--- /dev/null
+++ b/test/Transforms/LoopVectorize/same-base-access.ll
@@ -0,0 +1,110 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; This is kernel11 from "LivermoreLoops". We can't vectorize it because we
+; access both x[k] and x[k-1].
+;
+; void kernel11(double *x, double *y, int n) {
+;   for ( int k=1 ; k<n ; k++ )
+;     x[k] = x[k-1] + y[k];
+; }
+
+; CHECK: @kernel11
+; CHECK-NOT: <4 x double>
+; CHECK: ret
+define i32 @kernel11(double* %x, double* %y, i32 %n) nounwind uwtable ssp {
+  %1 = alloca double*, align 8
+  %2 = alloca double*, align 8
+  %3 = alloca i32, align 4
+  %k = alloca i32, align 4
+  store double* %x, double** %1, align 8
+  store double* %y, double** %2, align 8
+  store i32 %n, i32* %3, align 4
+  store i32 1, i32* %k, align 4
+  br label %4
+
+; <label>:4                                       ; preds = %25, %0
+  %5 = load i32* %k, align 4
+  %6 = load i32* %3, align 4
+  %7 = icmp slt i32 %5, %6
+  br i1 %7, label %8, label %28
+
+; <label>:8                                       ; preds = %4
+  %9 = load i32* %k, align 4
+  %10 = sub nsw i32 %9, 1
+  %11 = sext i32 %10 to i64
+  %12 = load double** %1, align 8
+  %13 = getelementptr inbounds double* %12, i64 %11
+  %14 = load double* %13, align 8
+  %15 = load i32* %k, align 4
+  %16 = sext i32 %15 to i64
+  %17 = load double** %2, align 8
+  %18 = getelementptr inbounds double* %17, i64 %16
+  %19 = load double* %18, align 8
+  %20 = fadd double %14, %19
+  %21 = load i32* %k, align 4
+  %22 = sext i32 %21 to i64
+  %23 = load double** %1, align 8
+  %24 = getelementptr inbounds double* %23, i64 %22
+  store double %20, double* %24, align 8
+  br label %25
+
+; <label>:25                                      ; preds = %8
+  %26 = load i32* %k, align 4
+  %27 = add nsw i32 %26, 1
+  store i32 %27, i32* %k, align 4
+  br label %4
+
+; <label>:28                                      ; preds = %4
+  ret i32 0
+}
+
+
+
+; We don't vectorize this function because A[i*7] is scalarized, and the
+; different scalars can in theory wrap around and overwrite other scalar
+; elements. At the moment we only allow read/write access to arrays
+; that are consecutive.
+; 
+; void foo(int *a) {
+;   for (int i=0; i<256; ++i) {
+;     int x = a[i*7];
+;     if (x>3)
+;       x = x*x+x*4;
+;     a[i*7] = x+3;
+;   }
+; }
+
+; CHECK: @func2
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+define i32 @func2(i32* nocapture %a) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %7, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ]
+  %2 = mul nsw i64 %indvars.iv, 7
+  %3 = getelementptr inbounds i32* %a, i64 %2
+  %4 = load i32* %3, align 4
+  %5 = icmp sgt i32 %4, 3
+  br i1 %5, label %6, label %7
+
+; <label>:6                                       ; preds = %1
+  %tmp = add i32 %4, 4
+  %tmp1 = mul i32 %tmp, %4
+  br label %7
+
+; <label>:7                                       ; preds = %6, %1
+  %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ]
+  %8 = add nsw i32 %x.0, 3
+  store i32 %8, i32* %3, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9                                       ; preds = %7
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/scalar-store.ll b/test/Transforms/LoopVectorize/scalar-store.ll
deleted file mode 100644
index 5d207f94454..00000000000
--- a/test/Transforms/LoopVectorize/scalar-store.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S -enable-if-conversion | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.9.0"
-
-; When we scalarize a store, make sure that the addresses are extracted
-; from a vector. We had a bug where the addresses were the old scalar
-; addresses.
-
-; CHECK: @foo
-; CHECK: select
-; CHECK: extractelement
-; CHECK-NEXT: store
-; CHECK: extractelement
-; CHECK-NEXT: store
-; CHECK: extractelement
-; CHECK-NEXT: store
-; CHECK: extractelement
-; CHECK-NEXT: store
-; CHECK: ret
-define i32 @foo(i32* nocapture %a) nounwind uwtable ssp {
-  br label %1
-
-; <label>:1                                       ; preds = %7, %0
-  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ]
-  %2 = mul nsw i64 %indvars.iv, 7
-  %3 = getelementptr inbounds i32* %a, i64 %2
-  %4 = load i32* %3, align 4
-  %5 = icmp sgt i32 %4, 3
-  br i1 %5, label %6, label %7
-
-; <label>:6                                       ; preds = %1
-  %tmp = add i32 %4, 4
-  %tmp1 = mul i32 %tmp, %4
-  br label %7
-
-; <label>:7                                       ; preds = %6, %1
-  %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ]
-  %8 = add nsw i32 %x.0, 3
-  store i32 %8, i32* %3, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 256
-  br i1 %exitcond, label %9, label %1
-
-; <label>:9                                       ; preds = %7
-  ret i32 0
-}