LoopVectorizer: Add a check that the backedge taken count + 1 does not overflow

The loop vectorizer instantiates be-taken-count + 1 as the loop iteration count. If this expression overflows the generated code was invalid. In case of overflow the code now jumps to the scalar loop. Fixes PR17288. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209854 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-01 00:11:00 +00:00 · 2014-05-29 22:10:01 +00:00 · 2014-05-29 22:10:01 +00:00 · 06413cd0f0
commit 06413cd0f0
parent ade072c1a9
3 changed files with 123 additions and 34 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -1909,20 +1909,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

-       [ ] <-- vector loop bypass (may consist of multiple blocks).
-     /  |
-    /   v
-   |   [ ]     <-- vector pre header.
-   |    |
-   |    v
-   |   [  ] \
-   |   [  ]_|   <-- vector loop.
-   |    |
-    \   v
-      >[ ]   <--- middle-block.
-     /  |
-    /   v
-   |   [ ]     <--- new preheader.
+       [ ] <-- Back-edge taken count overflow check.
+    /   |
+   /    v
+  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
+  |  /  |
+  | /   v
+  ||   [ ]     <-- vector pre header.
+  ||    |
+  ||    v
+  ||   [  ] \
+  ||   [  ]_|   <-- vector loop.
+  ||    |
+  | \   v
+  |   >[ ]   <--- middle-block.
+  |  /  |
+  | /   v
+  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
@ -1936,6 +1939,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(BypassBlock && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
@ -1958,15 +1962,31 @@ void InnerLoopVectorizer::createEmptyLoop() {
      IdxTy->getPrimitiveSizeInBits())
    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);

-  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
+  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
  // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(ExitCount,
-                             SE->getConstant(ExitCount->getType(), 1));
+  ExitCount = SE->getAddExpr(BackedgeTakeCount,
+                             SE->getConstant(BackedgeTakeCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

+  // We need to test whether the backedge-taken count is uint##_max. Adding one
+  // to it will cause overflow and an incorrect loop trip count in the vector
+  // body. In case of overflow we want to directly jump to the scalar remainder
+  // loop.
+  Value *BackedgeCount =
+      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+                        BypassBlock->getTerminator());
+  if (BackedgeCount->getType()->isPointerTy())
+    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+                                                "backedge.ptrcnt.to.int",
+                                                BypassBlock->getTerminator());
+  Instruction *CheckBCOverflow =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+                      Constant::getAllOnesValue(BackedgeCount->getType()),
+                      "backedge.overflow", BypassBlock->getTerminator());
+
  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());
@ -1980,7 +2000,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
                       IdxTy):
    ConstantInt::get(IdxTy, 0);

-  assert(BypassBlock && "Invalid loop structure");
  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
@ -2054,24 +2073,39 @@ void InnerLoopVectorizer::createEmptyLoop() {

  BasicBlock *LastBypassBlock = BypassBlock;

+  // Generate code to check that the loops trip count that we computed by adding
+  // one to the backedge-taken count will not overflow.
+  {
+    auto PastOverflowCheck = std::next(BasicBlock::iterator(CheckBCOverflow));
+    BasicBlock *CheckBlock =
+        LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+    LoopBypassBlocks.push_back(CheckBlock);
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
+    BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+    OldTerm->eraseFromParent();
+    LastBypassBlock = CheckBlock;
+  }
+
  // Generate the code to check that the strides we assumed to be one are really
  // one. We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  Instruction *StrideCheck;
  Instruction *FirstCheckInst;
  std::tie(FirstCheckInst, StrideCheck) =
-      addStrideCheck(BypassBlock->getTerminator());
+      addStrideCheck(LastBypassBlock->getTerminator());
  if (StrideCheck) {
    // Create a new block containing the stride check.
    BasicBlock *CheckBlock =
-        BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
    if (ParentLoop)
      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
-    Instruction *OldTerm = BypassBlock->getTerminator();
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

@ -2134,6 +2168,19 @@ void InnerLoopVectorizer::createEmptyLoop() {
      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
                      MiddleBlock->getTerminator()) : nullptr;

+    // Create phi nodes to merge from the  backedge-taken check block.
+    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+                                           ScalarPH->getTerminator());
+    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+    PHINode *BCTruncResumeVal = nullptr;
+    if (OrigPhi == OldInduction) {
+      BCTruncResumeVal =
+          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+                          ScalarPH->getTerminator());
+      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+    }
+
    Value *EndValue = nullptr;
    switch (II.IK) {
    case LoopVectorizationLegality::IK_NoInduction:
@ -2150,10 +2197,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
        // The new PHI merges the original incoming value, in case of a bypass,
        // or the value at the end of the vectorized loop.
-        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
        TruncResumeVal->addIncoming(EndValue, VecBody);

+        BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+
        // We know what the end value is.
        EndValue = IdxEndRoundDown;
        // We also know which PHI node holds it.
@ -2199,7 +2248,7 @@ void InnerLoopVectorizer::createEmptyLoop() {

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
      if (OrigPhi == OldInduction)
        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
      else
@ -2209,11 +2258,16 @@ void InnerLoopVectorizer::createEmptyLoop() {

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-    // The old inductions phi node in the scalar body needs the truncated value.
-    if (OrigPhi == OldInduction)
-      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
-    else
-      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+
+    // The old induction's phi node in the scalar body needs the truncated
+    // value.
+    if (OrigPhi == OldInduction) {
+      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
+    } else {
+      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    }
  }

  // If we are generating a new induction variable then we also need to
@ -2224,7 +2278,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }
@ -2494,7 +2548,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@ -2568,7 +2622,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
      NewPhi->addIncoming(RdxExitVal[part],
                          LoopVectorBody.back());
@ -2626,6 +2680,13 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                                    Builder.getInt32(0));
    }

+    // Create a phi node that merges control-flow from the backedge-taken check
+    // block and the middle block.
+    PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+                                          LoopScalarPreHeader->getTerminator());
+    BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
+    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
@ -2655,7 +2716,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.

@ -3112,8 +3173,8 @@ void InnerLoopVectorizer::updateAnalysis() {
    }
  }

-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
-  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@ -108,3 +108,30 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; <label>:5                                       ; preds = %1
  ret i32 %2
 }
+
+; This loop has a backedge taken count of i32_max. We need to check for this
+; condition and branch directly to the scalar loop.
+
+; CHECK-LABEL: max_i32_backedgetaken
+; CHECK:  %backedge.overflow = icmp eq i32 -1, -1
+; CHECK:  br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked
+
+; CHECK: scalar.ph:
+; CHECK:  %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ]
+
+define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
+
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i32 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i32 %b.0, -1
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
--- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@ -5,6 +5,7 @@ target triple = "x86_64-apple-macosx10.8.0"

 ;CHECK-LABEL: @add_ints(
 ;CHECK: br
+;CHECK: br
 ;CHECK: getelementptr
 ;CHECK-NEXT: getelementptr
 ;CHECK-DAG: icmp uge