LoopVectorizer: Emit memory checks into their own basic block.

This separates the check for "too few elements to run the vector loop" from the
"memory overlap" check, giving much nicer code and allowing us to skip the memory
checks when we're not going to execute the vector code anyway. We still leave
the decision of whether to emit the memory checks as branches or setccs, but it
seems to be doing a good job. If ugly code pops up we may want to emit them as
separate blocks too. Small speedup on MultiSource/Benchmarks/MallocBench/espresso.

Most of this is legwork to allow multiple bypass blocks while updating PHIs,
dominators and loop info.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172902 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Benjamin Kramer 2013-01-19 13:57:58 +00:00
parent ba95865441
commit 1af132dcf3
2 changed files with 57 additions and 33 deletions

View File

@ -163,8 +163,8 @@ private:
/// Add code that checks at runtime if the accessed arrays overlap. /// Add code that checks at runtime if the accessed arrays overlap.
/// Returns the comparator value or NULL if no check is needed. /// Returns the comparator value or NULL if no check is needed.
Value *addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc); Instruction *Loc);
/// Create an empty loop, based on the loop ranges of the old loop. /// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal); void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop. /// Copy and widen the instructions from the old loop.
@ -283,8 +283,8 @@ private:
BasicBlock *LoopVectorBody; BasicBlock *LoopVectorBody;
///The scalar loop body. ///The scalar loop body.
BasicBlock *LoopScalarBody; BasicBlock *LoopScalarBody;
///The first bypass block. /// A list of all bypass blocks. The first block is the entry of the loop.
BasicBlock *LoopBypassBlock; SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block. /// The new Induction variable which was added to the new block.
PHINode *Induction; PHINode *Induction;
@ -868,7 +868,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
} }
} }
Value* Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc) { Instruction *Loc) {
LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
@ -877,7 +877,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
if (!PtrRtCheck->Need) if (!PtrRtCheck->Need)
return NULL; return NULL;
Value *MemoryRuntimeCheck = 0; Instruction *MemoryRuntimeCheck = 0;
unsigned NumPointers = PtrRtCheck->Pointers.size(); unsigned NumPointers = PtrRtCheck->Pointers.size();
SmallVector<Value* , 2> Starts; SmallVector<Value* , 2> Starts;
SmallVector<Value* , 2> Ends; SmallVector<Value* , 2> Ends;
@ -918,8 +918,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Start0, End1, "bound0", Loc); Start0, End1, "bound0", Loc);
Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
Start1, End0, "bound1", Loc); Start1, End0, "bound1", Loc);
Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, Instruction *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0,
"found.conflict", Loc); Cmp1, "found.conflict",
Loc);
if (MemoryRuntimeCheck) if (MemoryRuntimeCheck)
MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
MemoryRuntimeCheck, MemoryRuntimeCheck,
@ -941,7 +942,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
the vectorized instructions while the old loop will continue to run the the vectorized instructions while the old loop will continue to run the
scalar remainder. scalar remainder.
[ ] <-- vector loop bypass. [ ] <-- vector loop bypass (may consist of multiple blocks).
/ | / |
/ v / v
| [ ] <-- vector pre header. | [ ] <-- vector pre header.
@ -1002,10 +1003,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
ConstantInt::get(IdxTy, 0); ConstantInt::get(IdxTy, 0);
assert(BypassBlock && "Invalid loop structure"); assert(BypassBlock && "Invalid loop structure");
LoopBypassBlocks.push_back(BypassBlock);
// Generate the code that checks in runtime if arrays overlap.
Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
BypassBlock->getTerminator());
// Split the single block loop into the two loop structure described above. // Split the single block loop into the two loop structure described above.
BasicBlock *VectorPH = BasicBlock *VectorPH =
@ -1062,10 +1060,24 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
StartIdx, StartIdx,
"cmp.zero", Loc); "cmp.zero", Loc);
// If we are using memory runtime checks, include them in. // Generate the code that checks in runtime if arrays overlap. We put the
if (MemoryRuntimeCheck) // checks into a separate block to make the more common case of few elements
Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, // faster.
"CntOrMem", Loc); if (Instruction *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc)) {
// Create a new block containing the memory check.
BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemoryRuntimeCheck,
"vector.memcheck");
LoopBypassBlocks.push_back(CheckBlock);
// Replace the branch into the memory check block with a conditional branch
// for the "few elements case".
Instruction *OldTerm = BypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
Cmp = MemoryRuntimeCheck;
assert(Loc == CheckBlock->getTerminator());
}
BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
// Remove the old terminator. // Remove the old terminator.
@ -1109,30 +1121,33 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Value *CRD = CountRoundDown; Value *CRD = CountRoundDown;
if (CRDSize > IISize) if (CRDSize > IISize)
CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
II.StartValue->getType(), II.StartValue->getType(), "tr.crd",
"tr.crd", BypassBlock->getTerminator()); LoopBypassBlocks.back()->getTerminator());
else if (CRDSize < IISize) else if (CRDSize < IISize)
CRD = CastInst::Create(Instruction::SExt, CountRoundDown, CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
II.StartValue->getType(), II.StartValue->getType(),
"sext.crd", BypassBlock->getTerminator()); "sext.crd",
LoopBypassBlocks.back()->getTerminator());
// Handle reverse integer induction counter: // Handle reverse integer induction counter:
EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", EndValue =
BypassBlock->getTerminator()); BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
LoopBypassBlocks.back()->getTerminator());
break; break;
} }
case LoopVectorizationLegality::IK_PtrInduction: { case LoopVectorizationLegality::IK_PtrInduction: {
// For pointer induction variables, calculate the offset using // For pointer induction variables, calculate the offset using
// the end index. // the end index.
EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, EndValue =
"ptr.ind.end", GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
BypassBlock->getTerminator()); LoopBypassBlocks.back()->getTerminator());
break; break;
} }
}// end of case }// end of case
// The new PHI merges the original incoming value, in case of a bypass, // The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop. // or the value at the end of the vectorized loop.
ResumeVal->addIncoming(II.StartValue, BypassBlock); for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
ResumeVal->addIncoming(EndValue, VecBody); ResumeVal->addIncoming(EndValue, VecBody);
// Fix the scalar body counter (PHI node). // Fix the scalar body counter (PHI node).
@ -1148,7 +1163,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
assert(!ResumeIndex && "Unexpected resume value found"); assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator()); MiddleBlock->getTerminator());
ResumeIndex->addIncoming(StartIdx, BypassBlock); for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
} }
@ -1188,6 +1204,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Insert the new loop into the loop nest and register the new basic blocks. // Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) { if (ParentLoop) {
ParentLoop->addChildLoop(Lp); ParentLoop->addChildLoop(Lp);
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
@ -1204,7 +1222,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
LoopExitBlock = ExitBlock; LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody; LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock; LoopScalarBody = OldBasicBlock;
LoopBypassBlock = BypassBlock;
} }
/// This function returns the identity element (or neutral element) for /// This function returns the identity element (or neutral element) for
@ -1344,7 +1361,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// To do so, we need to generate the 'identity' vector and override // To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need // one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader. // to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
// This is the vector-clone of the value that leaves the loop. // This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@ -1392,7 +1409,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity; Value *StartVal = (part == 0) ? VectorStart : Identity;
NewPhi->addIncoming(StartVal, LoopBypassBlock); for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
RdxParts.push_back(NewPhi); RdxParts.push_back(NewPhi);
} }
@ -1925,12 +1943,14 @@ void InnerLoopVectorizer::updateAnalysis() {
SE->forgetLoop(OrigLoop); SE->forgetLoop(OrigLoop);
// Update the dominator tree information. // Update the dominator tree information.
assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit."); "Entry does not dominate exit.");
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

View File

@ -9,6 +9,10 @@ target triple = "x86_64-apple-macosx10.9.0"
; a[i] = b[i] * 3; ; a[i] = b[i] * 3;
; } ; }
;CHECK: for.body.preheader:
;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck
;CHECK: vector.memcheck:
;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph
;CHECK: load <4 x float> ;CHECK: load <4 x float>
define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
entry: entry: