Add support for reduction variables that do not start at zero.

This is important for nested-loop reductions such as the one below.

In the innermost loop, the reduction variable (sum) does not start at zero:

for (i = 0 .. n)
 for (j = 0 .. m)
  sum += ...



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166387 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nadav Rotem 2012-10-21 05:52:51 +00:00
parent f01cad69c1
commit c847872629
2 changed files with 101 additions and 55 deletions

View File

@ -179,20 +179,36 @@ public:
TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
/// This represents the kinds of reductions that we support.
/// We use the enum values to hold the 'identity' value for
/// each operand. This value does not change the result if applied.
enum ReductionKind {
IntegerAdd, /// Sum of numbers.
IntegerMult, /// Product of numbers.
NoReduction /// Not a reduction.
NoReduction = -1, /// Not a reduction.
IntegerAdd = 0, /// Sum of numbers.
IntegerMult = 1 /// Product of numbers.
};
// Holds a pairing of reduction instruction and the reduction kind.
typedef std::pair<Instruction*, ReductionKind> ReductionPair;
/// This POD struct holds information about reduction variables.
struct ReductionDescriptor {
// Default C'tor
ReductionDescriptor():
StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
/// ReductionList contains the reduction variables
/// as well as a single EXIT (from the block) value and the kind of
/// reduction variable..
/// Notice that the EXIT instruction can also be the PHI itself.
typedef DenseMap<PHINode*, ReductionPair> ReductionList;
// C'tor.
ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
// The starting value of the reduction.
// It does not have to be zero!
Value *StartValue;
// The instruction who's value is used outside the loop.
Instruction *LoopExitInstr;
// The kind of the reduction.
ReductionKind Kind;
};
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
/// Returns the maximum vectorization factor that we *can* use to vectorize
/// this loop. This does not mean that it is profitable to vectorize this
@ -229,9 +245,6 @@ private:
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
/// Checks if a constant matches the reduction kind.
/// Sums starts with zero. Products start at one.
bool isReductionConstant(Value *V, ReductionKind Kind);
/// Returns true if the instruction I can be a reduction variable of type
/// 'Kind'.
bool isReductionInstr(Instruction *I, ReductionKind Kind);
@ -628,6 +641,8 @@ void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
typedef SmallVector<PHINode*, 4> PhiVector;
BasicBlock &BB = *Orig->getHeader();
Constant *Zero = ConstantInt::get(
IntegerType::getInt32Ty(BB.getContext()), 0);
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@ -803,29 +818,42 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
assert(RdxPhi && "Unable to recover vectorized PHI");
// Find the reduction variable.
// Find the reduction variable descriptor.
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionPair ReductionVar =
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
(*Legal->getReductionVars())[RdxPhi];
// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and overide
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
// This is the vector-clone of the value that leaves the loop.
Value *VectorExit = getVectorValue(ReductionVar.first);
Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit->getType();
// This is the kind of reduction.
LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
// Find the reduction identity variable.
// Zero for addition. One for Multiplication.
unsigned IdentitySclr =
(RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
// Find the reduction identity variable. The value of the enum is the
// identity. Zero for addition. One for Multiplication.
unsigned IdentitySclr = RdxDesc.Kind;
Constant *Identity = getUniformVector(IdentitySclr,
VecTy->getScalarType());
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
Value *VectorStart = Builder.CreateInsertElement(Identity,
RdxDesc.StartValue, Zero);
// Fix the vector-loop phi.
// We created the induction variable so we know that the
// preheader is the first entry.
BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
VecRdxPhi->addIncoming(Identity, VecPreheader);
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VecRdxPhi->addIncoming(VectorStart, VecPreheader);
unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
VecRdxPhi->addIncoming(Val, LoopVectorBody);
@ -837,10 +865,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
// This PHINode contains the vectorized reduction variable, or
// the identity vector, if we bypass the vector loop.
// the initial value vector, if we bypass the vector loop.
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
NewPhi->addIncoming(Identity, LoopBypassBlock);
NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
NewPhi->addIncoming(VectorStart, LoopBypassBlock);
NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
// Extract the first scalar.
Value *Scalar0 =
@ -849,7 +877,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
for (unsigned i=1; i < VF; ++i) {
Value *Scalar1 =
Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
} else {
Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
@ -865,11 +893,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) continue;
// All PHINodes need to have a single entry edge, or two if we already fixed them.
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
// We found our reduction value exit-PHI. Update it with the incoming bypass edge.
if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
// We found our reduction value exit-PHI. Update it with the
// incoming bypass edge.
if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
// Add an edge coming from the bypass.
LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
break;
@ -881,7 +911,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
}
@ -1167,10 +1197,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
// We must have a constant that starts the reduction.
if (!isReductionConstant(RdxStart, Kind))
return false;
// ExitInstruction is the single value which is used outside the loop.
// We only allow for a single reduction value to be used outside the loop.
// This includes users of the reduction, variables (which form a cycle
@ -1228,23 +1254,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
if (FoundStartPHI && ExitInstruction) {
// This instruction is allowed to have out-of-loop users.
AllowedExit.insert(ExitInstruction);
// Mark this as a reduction var.
Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
return true;
}
}
}
bool
LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
ConstantInt *CI = dyn_cast<ConstantInt>(V);
if (!CI)
return false;
if (Kind == IntegerMult && CI->isOne())
// Save the description of this reduction variable.
ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
Reductions[Phi] = RD;
return true;
if (Kind == IntegerAdd && CI->isZero())
return true;
return false;
}
}
}
bool

View File

@ -93,16 +93,16 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
ret i32 %sum.0.lcssa
}
;CHECK: @reduction_bad
;CHECK-NOT: <4 x i32>
;CHECK: @reduction_mul
;CHECK: mul <4 x i32>
;CHECK: ret i32
define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
%sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
%sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
%2 = getelementptr inbounds i32* %A, i64 %indvars.iv
%3 = load i32* %2, align 4
%4 = getelementptr inbounds i32* %B, i64 %indvars.iv
@ -120,3 +120,33 @@ define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
%sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
ret i32 %sum.0.lcssa
}
;CHECK: @start_at_non_zero
;CHECK: phi <4 x i32>
;CHECK: <i32 120, i32 0, i32 0, i32 0>
;CHECK: ret i32
; Regression input for a reduction whose accumulator does not start at zero.
; Returns 120 plus the sum over i in [0, %n) of %in[i] * %coeff[i]; when
; %n <= 0 the loop is skipped and 120 is returned unchanged.
; %out is not referenced anywhere in the body.
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
; Guard: only enter the loop when %n is strictly positive.
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
; 64-bit loop index, starting at 0 and advanced by 1 each iteration.
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; Running sum; its incoming value from %entry is the non-zero start value 120.
%sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
%arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
%0 = load i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
%1 = load i32* %arrayidx2, align 4
; Accumulate the product of the two loaded elements into the running sum.
%mul = mul nsw i32 %1, %0
%add = add nsw i32 %mul, %sum.09
%indvars.iv.next = add i64 %indvars.iv, 1
; Leave the loop once the next index, truncated to i32, equals %n.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
; 120 when the loop was skipped, otherwise the last %add computed in the loop.
%sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
ret i32 %sum.0.lcssa
}