mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-02-06 06:33:24 +00:00
LoopVectorize: Support conditional stores by scalarizing
The vectorizer takes a loop like the one below and widens all instructions except for the store. The stores are scalarized/unrolled and each one is hidden behind an "if" block.

Original loop:
for (i = 0; i < 128; ++i) { if (a[i] < 10) a[i] += val; }

After vectorization (VF = 2):
for (i = 0; i < 128; i+=2) { v = a[i:i+1]; v0 = (extract v, 0) + val; v1 = (extract v, 1) + val; if ((extract v, 0) < 10) a[i] = v0; if ((extract v, 1) < 10) a[i+1] = v1; }

The vectorizer relies on subsequent optimizations to sink instructions into the conditional block where they are anticipated.

The flag "vectorize-num-stores-pred" controls whether, and how many, stores are handled this way. Vectorization of conditional stores is disabled by default for now.

This patch also changes the unroll heuristic when the flag "enable-loadstore-runtime-unroll" is enabled (off by default). It unrolls small loops until the load/store ports are saturated. This heuristic uses TTI's getMaxUnrollFactor as a measure of the number of load/store ports.

I also added a second flag, -enable-cond-stores-vec, which enables vectorization of conditional stores. However, there is no cost model for vectorization of conditional stores in place yet, so enabling it is unlikely to be profitable at the moment.

rdar://15892953

Results for x86-64 -O3 -mavx +/- -mllvm -enable-loadstore-runtime-unroll -vectorize-num-stores-pred=1 (before the BFI change):

Performance Regressions: Benchmarks/Ptrdist/yacr2/yacr2 7.35% (maze3() is identical but 10% slower) Applications/siod/siod 2.18%

Performance Improvements: mesa -4.42% libquantum -4.15%

With a patch that slightly changes the register heuristics (by subtracting the induction variable on both sides of the register pressure equation, as the induction variable is probably not really unrolled):

Performance Regressions: Benchmarks/Ptrdist/yacr2/yacr2 7.73% Applications/siod/siod 1.97%

Performance Improvements: libquantum -13.05% (we now also unroll quantum_toffoli) mesa -4.27%

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200270 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
1c2827cd6a
commit
a47aa4b4ef
@ -172,6 +172,20 @@ static cl::opt<unsigned> SmallLoopCost(
|
||||
"small-loop-cost", cl::init(20), cl::Hidden,
|
||||
cl::desc("The cost of a loop that is considered 'small' by the unroller."));
|
||||
|
||||
// Runtime unroll loops for load/store throughput.
|
||||
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
|
||||
"enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden,
|
||||
cl::desc("Enable runtime unrolling until load/store ports are saturated"));
|
||||
|
||||
/// The number of stores in a loop that are allowed to need predication.
|
||||
static cl::opt<unsigned> NumberOfStoresToPredicate(
|
||||
"vectorize-num-stores-pred", cl::init(0), cl::Hidden,
|
||||
cl::desc("Max number of stores to be predicated behind an if."));
|
||||
|
||||
static cl::opt<bool> EnableCondStoresVectorization(
|
||||
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
|
||||
cl::desc("Enable if predication of stores during vectorization."));
|
||||
|
||||
namespace {
|
||||
|
||||
// Forward declarations.
|
||||
@ -275,8 +289,11 @@ protected:
|
||||
void updateAnalysis();
|
||||
|
||||
/// This instruction is un-vectorizable. Implement it as a sequence
|
||||
/// of scalars.
|
||||
virtual void scalarizeInstruction(Instruction *Instr);
|
||||
/// of scalars. If \p IfPredicateStore is true we need to 'hide' each
|
||||
/// scalarized instruction behind an if block predicated on the control
|
||||
/// dependence of the instruction.
|
||||
virtual void scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore=false);
|
||||
|
||||
/// Vectorize Load and Store instructions,
|
||||
virtual void vectorizeMemoryInstruction(Instruction *Instr);
|
||||
@ -379,7 +396,7 @@ protected:
|
||||
///The ExitBlock of the scalar loop.
|
||||
BasicBlock *LoopExitBlock;
|
||||
///The vector loop body.
|
||||
BasicBlock *LoopVectorBody;
|
||||
SmallVector<BasicBlock *, 4> LoopVectorBody;
|
||||
///The scalar loop body.
|
||||
BasicBlock *LoopScalarBody;
|
||||
/// A list of all bypass blocks. The first block is the entry of the loop.
|
||||
@ -406,7 +423,7 @@ public:
|
||||
InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
|
||||
|
||||
private:
|
||||
virtual void scalarizeInstruction(Instruction *Instr);
|
||||
virtual void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false);
|
||||
virtual void vectorizeMemoryInstruction(Instruction *Instr);
|
||||
virtual Value *getBroadcastInstrs(Value *V);
|
||||
virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
|
||||
@ -456,10 +473,14 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
|
||||
/// induction variable and the different reduction variables.
|
||||
class LoopVectorizationLegality {
|
||||
public:
|
||||
unsigned NumLoads;
|
||||
unsigned NumStores;
|
||||
unsigned NumPredStores;
|
||||
|
||||
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
|
||||
DominatorTree *DT, TargetLibraryInfo *TLI)
|
||||
: TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
|
||||
Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
|
||||
: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
|
||||
DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
|
||||
MaxSafeDepDistBytes(-1U) {}
|
||||
|
||||
/// This enum represents the kinds of reductions that we support.
|
||||
@ -1206,7 +1227,9 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
|
||||
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
|
||||
// We need to place the broadcast of invariant variables outside the loop.
|
||||
Instruction *Instr = dyn_cast<Instruction>(V);
|
||||
bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
|
||||
bool NewInstr =
|
||||
(Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
|
||||
Instr->getParent()) != LoopVectorBody.end());
|
||||
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
|
||||
|
||||
// Place the code for broadcasting invariant variables in the new preheader.
|
||||
@ -1411,6 +1434,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
|
||||
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
|
||||
|
||||
if (SI && Legal->blockNeedsPredication(SI->getParent()))
|
||||
return scalarizeInstruction(Instr, true);
|
||||
|
||||
if (ScalarAllocatedSize != VectorElementSize)
|
||||
return scalarizeInstruction(Instr);
|
||||
|
||||
@ -1529,7 +1555,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
}
|
||||
}
|
||||
|
||||
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
|
||||
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
|
||||
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
|
||||
// Holds vector parameters or scalars, in case of uniform vals.
|
||||
SmallVector<VectorParts, 4> Params;
|
||||
@ -1574,10 +1600,37 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
|
||||
// Create a new entry in the WidenMap and initialize it to Undef or Null.
|
||||
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
|
||||
|
||||
Instruction *InsertPt = Builder.GetInsertPoint();
|
||||
BasicBlock *IfBlock = Builder.GetInsertBlock();
|
||||
BasicBlock *CondBlock = 0;
|
||||
|
||||
VectorParts Cond;
|
||||
Loop *VectorLp = 0;
|
||||
if (IfPredicateStore) {
|
||||
assert(Instr->getParent()->getSinglePredecessor() &&
|
||||
"Only support single predecessor blocks");
|
||||
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
|
||||
Instr->getParent());
|
||||
VectorLp = LI->getLoopFor(IfBlock);
|
||||
assert(VectorLp && "Must have a loop for this block");
|
||||
}
|
||||
|
||||
// For each vector unroll 'part':
|
||||
for (unsigned Part = 0; Part < UF; ++Part) {
|
||||
// For each scalar that we create:
|
||||
for (unsigned Width = 0; Width < VF; ++Width) {
|
||||
|
||||
// Start if-block.
|
||||
Value *Cmp = 0;
|
||||
if (IfPredicateStore) {
|
||||
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
|
||||
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
|
||||
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
|
||||
VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
|
||||
// Update Builder with newly created basic block.
|
||||
Builder.SetInsertPoint(InsertPt);
|
||||
}
|
||||
|
||||
Instruction *Cloned = Instr->clone();
|
||||
if (!IsVoidRetTy)
|
||||
Cloned->setName(Instr->getName() + ".cloned");
|
||||
@ -1598,6 +1651,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
|
||||
if (!IsVoidRetTy)
|
||||
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
|
||||
Builder.getInt32(Width));
|
||||
// End if-block.
|
||||
if (IfPredicateStore) {
|
||||
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
|
||||
VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
|
||||
Builder.SetInsertPoint(InsertPt);
|
||||
Instruction *OldBr = IfBlock->getTerminator();
|
||||
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
|
||||
OldBr->eraseFromParent();
|
||||
IfBlock = NewIfBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2101,7 +2164,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
|
||||
LoopScalarPreHeader = ScalarPH;
|
||||
LoopMiddleBlock = MiddleBlock;
|
||||
LoopExitBlock = ExitBlock;
|
||||
LoopVectorBody = VecBody;
|
||||
LoopVectorBody.push_back(VecBody);
|
||||
LoopScalarBody = OldBasicBlock;
|
||||
|
||||
LoopVectorizeHints Hints(Lp, true);
|
||||
@ -2369,25 +2432,42 @@ struct CSEDenseMapInfo {
|
||||
};
|
||||
}
|
||||
|
||||
/// \brief Check whether this block is a predicated block.
|
||||
/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
|
||||
/// = ...; " blocks. We start with one vectorized basic block. For every
|
||||
/// conditional block we split this vectorized block. Therefore, every second
|
||||
/// block will be a predicated one.
|
||||
static bool isPredicatedBlock(unsigned BlockNum) {
|
||||
return BlockNum % 2;
|
||||
}
|
||||
|
||||
///\brief Perform cse of induction variable instructions.
|
||||
static void cse(BasicBlock *BB) {
|
||||
static void cse(SmallVector<BasicBlock *, 4> &BBs) {
|
||||
// Perform simple cse.
|
||||
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
|
||||
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
|
||||
Instruction *In = I++;
|
||||
for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
|
||||
BasicBlock *BB = BBs[i];
|
||||
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
|
||||
Instruction *In = I++;
|
||||
|
||||
if (!CSEDenseMapInfo::canHandle(In))
|
||||
continue;
|
||||
if (!CSEDenseMapInfo::canHandle(In))
|
||||
continue;
|
||||
|
||||
// Check if we can replace this instruction with any of the
|
||||
// visited instructions.
|
||||
if (Instruction *V = CSEMap.lookup(In)) {
|
||||
In->replaceAllUsesWith(V);
|
||||
In->eraseFromParent();
|
||||
continue;
|
||||
// Check if we can replace this instruction with any of the
|
||||
// visited instructions.
|
||||
if (Instruction *V = CSEMap.lookup(In)) {
|
||||
In->replaceAllUsesWith(V);
|
||||
In->eraseFromParent();
|
||||
continue;
|
||||
}
|
||||
// Ignore instructions in conditional blocks. We create "if (pred) a[i] =
|
||||
// ...;" blocks for predicated stores. Every second block is a predicated
|
||||
// block.
|
||||
if (isPredicatedBlock(i))
|
||||
continue;
|
||||
|
||||
CSEMap[In] = In;
|
||||
}
|
||||
|
||||
CSEMap[In] = In;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2503,7 +2583,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
|
||||
// first unroll part.
|
||||
Value *StartVal = (part == 0) ? VectorStart : Identity;
|
||||
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
|
||||
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
|
||||
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
|
||||
LoopVectorBody.back());
|
||||
}
|
||||
|
||||
// Before each round, move the insertion point right between
|
||||
@ -2522,7 +2603,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
|
||||
Value *StartVal = (part == 0) ? VectorStart : Identity;
|
||||
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
|
||||
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
|
||||
NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
|
||||
NewPhi->addIncoming(RdxExitVal[part],
|
||||
LoopVectorBody.back());
|
||||
RdxParts.push_back(NewPhi);
|
||||
}
|
||||
|
||||
@ -2695,7 +2777,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
|
||||
Type *VecTy = (VF == 1) ? PN->getType() :
|
||||
VectorType::get(PN->getType(), VF);
|
||||
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
|
||||
LoopVectorBody-> getFirstInsertionPt());
|
||||
LoopVectorBody.back()-> getFirstInsertionPt());
|
||||
}
|
||||
PV->push_back(P);
|
||||
return;
|
||||
@ -3044,7 +3126,19 @@ void InnerLoopVectorizer::updateAnalysis() {
|
||||
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
|
||||
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
|
||||
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
|
||||
DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
|
||||
|
||||
// Due to if predication of stores we might create a sequence of "if(pred)
|
||||
// a[i] = ...; " blocks.
|
||||
for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
|
||||
if (i == 0)
|
||||
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
|
||||
else if (isPredicatedBlock(i)) {
|
||||
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
|
||||
} else {
|
||||
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
|
||||
}
|
||||
}
|
||||
|
||||
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
|
||||
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
|
||||
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
|
||||
@ -4292,6 +4386,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
|
||||
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
|
||||
return false;
|
||||
}
|
||||
NumLoads++;
|
||||
Loads.push_back(Ld);
|
||||
DepChecker.addAccess(Ld);
|
||||
continue;
|
||||
@ -4305,6 +4400,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
|
||||
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
|
||||
return false;
|
||||
}
|
||||
NumStores++;
|
||||
Stores.push_back(St);
|
||||
DepChecker.addAccess(St);
|
||||
}
|
||||
@ -4816,7 +4912,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
|
||||
}
|
||||
|
||||
// We don't predicate stores at the moment.
|
||||
if (it->mayWriteToMemory() || it->mayThrow())
|
||||
if (it->mayWriteToMemory()) {
|
||||
StoreInst *SI = dyn_cast<StoreInst>(it);
|
||||
// We only support predication of stores in basic blocks with one
|
||||
// predecessor.
|
||||
if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
|
||||
!SafePtrs.count(SI->getPointerOperand()) ||
|
||||
!SI->getParent()->getSinglePredecessor())
|
||||
return false;
|
||||
}
|
||||
if (it->mayThrow())
|
||||
return false;
|
||||
|
||||
// Check that we don't have a constant expression that can trap as operand.
|
||||
@ -4851,6 +4956,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
||||
return Factor;
|
||||
}
|
||||
|
||||
if (!EnableCondStoresVectorization && Legal->NumPredStores) {
|
||||
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
|
||||
return Factor;
|
||||
}
|
||||
|
||||
// Find the trip count.
|
||||
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
|
||||
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
|
||||
@ -5066,6 +5176,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
|
||||
return UF;
|
||||
}
|
||||
|
||||
if (EnableLoadStoreRuntimeUnroll &&
|
||||
!Legal->getRuntimePointerCheck()->Need &&
|
||||
LoopCost < SmallLoopCost) {
|
||||
// Unroll until store/load ports (estimated by max unroll factor) are
|
||||
// saturated.
|
||||
unsigned UnrollStores = UF / (Legal->NumStores ? Legal->NumStores : 1);
|
||||
unsigned UnrollLoads = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
|
||||
UF = std::max(std::min(UnrollStores, UnrollLoads), 1u);
|
||||
return UF;
|
||||
}
|
||||
|
||||
// We want to unroll tiny loops in order to reduce the loop overhead.
|
||||
// We assume that the cost overhead is 1 and we use the cost model
|
||||
// to estimate the cost of the loop and unroll until the cost of the
|
||||
@ -5515,7 +5636,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
|
||||
}
|
||||
|
||||
|
||||
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
|
||||
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore) {
|
||||
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
|
||||
// Holds vector parameters or scalars, in case of uniform vals.
|
||||
SmallVector<VectorParts, 4> Params;
|
||||
@ -5560,10 +5682,39 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
|
||||
// Create a new entry in the WidenMap and initialize it to Undef or Null.
|
||||
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
|
||||
|
||||
Instruction *InsertPt = Builder.GetInsertPoint();
|
||||
BasicBlock *IfBlock = Builder.GetInsertBlock();
|
||||
BasicBlock *CondBlock = 0;
|
||||
|
||||
VectorParts Cond;
|
||||
Loop *VectorLp = 0;
|
||||
if (IfPredicateStore) {
|
||||
assert(Instr->getParent()->getSinglePredecessor() &&
|
||||
"Only support single predecessor blocks");
|
||||
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
|
||||
Instr->getParent());
|
||||
VectorLp = LI->getLoopFor(IfBlock);
|
||||
assert(VectorLp && "Must have a loop for this block");
|
||||
}
|
||||
|
||||
// For each vector unroll 'part':
|
||||
for (unsigned Part = 0; Part < UF; ++Part) {
|
||||
// For each scalar that we create:
|
||||
|
||||
// Start an "if (pred) a[i] = ..." block.
|
||||
Value *Cmp = 0;
|
||||
if (IfPredicateStore) {
|
||||
if (Cond[Part]->getType()->isVectorTy())
|
||||
Cond[Part] =
|
||||
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
|
||||
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
|
||||
ConstantInt::get(Cond[Part]->getType(), 1));
|
||||
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
|
||||
VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
|
||||
// Update Builder with newly created basic block.
|
||||
Builder.SetInsertPoint(InsertPt);
|
||||
}
|
||||
|
||||
Instruction *Cloned = Instr->clone();
|
||||
if (!IsVoidRetTy)
|
||||
Cloned->setName(Instr->getName() + ".cloned");
|
||||
@ -5580,11 +5731,25 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
|
||||
// so that future users will be able to use it.
|
||||
if (!IsVoidRetTy)
|
||||
VecResults[Part] = Cloned;
|
||||
|
||||
// End if-block.
|
||||
if (IfPredicateStore) {
|
||||
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
|
||||
VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
|
||||
Builder.SetInsertPoint(InsertPt);
|
||||
Instruction *OldBr = IfBlock->getTerminator();
|
||||
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
|
||||
OldBr->eraseFromParent();
|
||||
IfBlock = NewIfBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
return scalarizeInstruction(Instr);
|
||||
StoreInst *SI = dyn_cast<StoreInst>(Instr);
|
||||
bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
|
||||
|
||||
return scalarizeInstruction(Instr, IfPredicateStore);
|
||||
}
|
||||
|
||||
Value *InnerLoopUnroller::reverseVector(Value *Vec) {
|
||||
|
86
test/Transforms/LoopVectorize/if-pred-stores.ll
Normal file
86
test/Transforms/LoopVectorize/if-pred-stores.ll
Normal file
@ -0,0 +1,86 @@
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-unroll=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-unroll=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.9.0"
|
||||
|
||||
; Test predication of stores.
|
||||
define i32 @test(i32* nocapture %f) #0 {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; VEC-LABEL: test
|
||||
; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
|
||||
; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
|
||||
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
|
||||
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
|
||||
; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true
|
||||
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
|
||||
;
|
||||
; VEC: [[cond]]:
|
||||
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
|
||||
; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
|
||||
; VEC: store i32 %[[v13]], i32* %[[v14]], align 4
|
||||
; VEC: br label %[[else:.+]]
|
||||
;
|
||||
; VEC: [[else]]:
|
||||
; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
|
||||
; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true
|
||||
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
|
||||
;
|
||||
; VEC: [[cond2]]:
|
||||
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
|
||||
; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
|
||||
; VEC: store i32 %[[v17]], i32* %[[v18]], align 4
|
||||
; VEC: br label %[[else2:.+]]
|
||||
;
|
||||
; VEC: [[else2]]:
|
||||
|
||||
; UNROLL-LABEL: test
|
||||
; UNROLL: vector.body:
|
||||
; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
|
||||
; UNROLL: %[[IND1:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 1
|
||||
; UNROLL: %[[v0:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND]]
|
||||
; UNROLL: %[[v1:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND1]]
|
||||
; UNROLL: %[[v2:[a-zA-Z0-9]+]] = load i32* %[[v0]], align 4
|
||||
; UNROLL: %[[v3:[a-zA-Z0-9]+]] = load i32* %[[v1]], align 4
|
||||
; UNROLL: %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
|
||||
; UNROLL: %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
|
||||
; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
|
||||
; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
|
||||
; UNROLL: %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[v4]], true
|
||||
; UNROLL: br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
|
||||
;
|
||||
; UNROLL: [[cond]]:
|
||||
; UNROLL: store i32 %[[v6]], i32* %[[v0]], align 4
|
||||
; UNROLL: br label %[[else]]
|
||||
;
|
||||
; UNROLL: [[else]]:
|
||||
; UNROLL: %[[v9:[a-zA-Z0-9]+]] = icmp eq i1 %[[v5]], true
|
||||
; UNROLL: br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
|
||||
;
|
||||
; UNROLL: [[cond2]]:
|
||||
; UNROLL: store i32 %[[v7]], i32* %[[v1]], align 4
|
||||
; UNROLL: br label %[[else2]]
|
||||
;
|
||||
; UNROLL: [[else2]]:
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
|
||||
%arrayidx = getelementptr inbounds i32* %f, i64 %indvars.iv
|
||||
%0 = load i32* %arrayidx, align 4
|
||||
%cmp1 = icmp sgt i32 %0, 100
|
||||
br i1 %cmp1, label %if.then, label %for.inc
|
||||
|
||||
if.then:
|
||||
%add = add nsw i32 %0, 20
|
||||
store i32 %add, i32* %arrayidx, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc:
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 128
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end:
|
||||
ret i32 0
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user