LoopVectorizer: Recognize min/max reductions

A min/max operation is represented by a select(cmp(lt/le/gt/ge, X, Y), X, Y) sequence in LLVM. If we see such a sequence we can treat it just as any other commutative binary instruction and reduce it. This appears to help bzip2 by about 1.5% on an imac12,2. radar://12960601 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179773 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-15 19:24:33 +00:00 · 2013-04-18 17:22:34 +00:00
parent bff177676c
commit a3fb330d05
2 changed files with 608 additions and 34 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -343,6 +343,7 @@ public:
    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
+    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,    ///< Sum of floats.
    RK_FloatMult    ///< Product of floats.
  };
@@ -361,8 +362,9 @@ public:
    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
      Kind(RK_NoReduction) {}

-    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
-        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
+                        CmpInst::Predicate P)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxPred(P) {}

    // The starting value of the reduction.
    // It does not have to be zero!
@@ -371,6 +373,25 @@ public:
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
+    // If this is a min/max reduction, the comparison predicate defining it.
+    CmpInst::Predicate MinMaxPred;
+  };
+
+  /// This POD struct holds information about a potential reduction operation.
+  struct ReductionInstDesc {
+    ReductionInstDesc(bool IsRedux, Instruction *I) :
+      IsReduction(IsRedux), PatternLastInst(I), Predicate(ICmpInst::ICMP_EQ) {}
+
+    ReductionInstDesc(Instruction *I, CmpInst::Predicate P) :
+      IsReduction(true), PatternLastInst(I), Predicate(P) {}
+
+    // Is this instruction a reduction candidate.
+    bool IsReduction;
+    // The last instruction in a min/max pattern (select of the select(icmp())
+    // pattern), or the current reduction instruction otherwise.
+    Instruction *PatternLastInst;
+    // If this is a min/max pattern the comparison predicate.
+    CmpInst::Predicate Predicate;
  };

  // This POD struct holds information about the memory runtime legality
@@ -487,9 +508,13 @@ private:
  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Returns true if the instruction I can be a reduction variable of type
-  /// 'Kind'.
-  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns a struct describing if the instruction 'I' can be a reduction
+  /// variable of type 'Kind'. For a min/max pattern of select(icmp()) the
+  /// 'PatternLastInst' member of the returned struct is the select, even when
+  /// 'I' is the compare. 'Desc' is the result for the previous instruction in
+  /// the chain; its predicate is carried over when 'I' is a PHI.
+  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
+                                     ReductionInstDesc Desc);
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);
@@ -1437,7 +1462,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp,
+                     CmpInst::Predicate Pred) {
  switch (K) {
  case LoopVectorizationLegality:: RK_IntegerXor:
  case LoopVectorizationLegality:: RK_IntegerAdd:
@@ -1456,6 +1482,28 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
  case LoopVectorizationLegality:: RK_FloatAdd:
    // Adding zero to a number does not change it.
    return ConstantFP::get(Tp, 0.0L);
+  case LoopVectorizationLegality:: RK_IntegerMinMax:
+    switch(Pred) {
+    default: llvm_unreachable("Unknown min/max predicate");
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_ULE:
+      return ConstantInt::getAllOnesValue(Tp);
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_UGE:
+      return ConstantInt::get(Tp, 0);
+    case CmpInst::ICMP_SLT:
+    case CmpInst::ICMP_SLE: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMaxValue(BitWidth));
+    }
+    case CmpInst::ICMP_SGT:
+    case CmpInst::ICMP_SGE: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMinValue(BitWidth));
+    }
+    }
  default:
    llvm_unreachable("Unknown reduction kind");
  }
@@ -1566,7 +1614,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
 }

 /// This function translates the reduction kind to an LLVM binary operator.
-static Instruction::BinaryOps
+static unsigned
 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
  switch (Kind) {
    case LoopVectorizationLegality::RK_IntegerAdd:
@@ -1583,11 +1631,20 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
      return Instruction::FMul;
    case LoopVectorizationLegality::RK_FloatAdd:
      return Instruction::FAdd;
+    case LoopVectorizationLegality::RK_IntegerMinMax:
+      return Instruction::ICmp;
    default:
      llvm_unreachable("Unknown reduction operation");
  }
 }

+Value *createMinMaxOp(IRBuilder<> &Builder, ICmpInst::Predicate P, Value *Left,
+                      Value *Right) {
+  Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+  return Select;
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
@@ -1651,7 +1708,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
-    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType(),
+                                          RdxDesc.MinMaxPred);
    Constant *Identity = ConstantVector::getSplat(VF, Iden);

    // This vector is the Identity vector where the first element is the
@@ -1699,10 +1757,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = getReductionBinOp(RdxDesc.Kind);
    for (unsigned part = 1; part < UF; ++part) {
-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
-                                           "bin.rdx");
+      if (Op != Instruction::ICmp)
+        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
+                                             RdxParts[part], ReducedPartRdx,
+                                             "bin.rdx");
+      else
+        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxPred,
+                                        ReducedPartRdx, RdxParts[part]);
    }

    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
@@ -1727,8 +1790,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                    ConstantVector::get(ShuffleMask),
                                    "rdx.shuf");

-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+      if (Op != Instruction::ICmp)
+        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+                                     "bin.rdx");
+      else
+        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxPred, TmpVec, Shuf);
    }

    // The result is in the first element of the vector.
@@ -2315,6 +2381,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }
+        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
        if (AddReductionVar(Phi, RK_FloatMult)) {
          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
          continue;
@@ -2734,6 +2804,14 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
  // out-of-block user. The cycle must end with the original PHI.
  Instruction *Iter = Phi;

+  // To recognize min/max patterns formed by an icmp/select sequence, we store
+  // the number of instructions we saw from the recognized min/max pattern,
+  // such that we don't stop when we see the phi has two uses (one by the select
+  // and one by the icmp) and to make sure we only see exactly the two
+  // instructions.
+  unsigned NumICmpSelectPatternInst = 0;
+  ReductionInstDesc ReduxDesc(false, 0);
+
  // Avoid cycles in the chain.
  SmallPtrSet<Instruction *, 8> VisitedInsts;
  while (VisitedInsts.insert(Iter)) {
@@ -2778,23 +2856,35 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
          Iter->hasNUsesOrMore(2))
        continue;

-      // We can't have multiple inside users.
-      if (FoundInBlockUser)
+      // We can't have multiple inside users except for a combination of
+      // icmp/select both using the phi.
+      if (FoundInBlockUser && !NumICmpSelectPatternInst)
        return false;
      FoundInBlockUser = true;

      // Any reduction instr must be of one of the allowed kinds.
-      if (!isReductionInstr(U, Kind))
+      ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
+      if (!ReduxDesc.IsReduction)
        return false;

+      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
+                                       isa<SelectInst>(U)))
+          ++NumICmpSelectPatternInst;
+
      // Reductions of instructions such as Div, and Sub is only
      // possible if the LHS is the reduction variable.
-      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+      if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
+          !isa<ICmpInst>(U) && U->getOperand(0) != Iter)
        return false;

-      Iter = U;
+      Iter = ReduxDesc.PatternLastInst;
    }

+    // This means we have seen one but not the other instruction of the
+    // pattern or more than just a select and cmp.
+    if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
+      return false;
+
    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
@@ -2803,7 +2893,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
-      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+                             ReduxDesc.Predicate);
      Reductions[Phi] = RD;
      // We've ended the cycle. This is a reduction variable if we have an
      // outside user and it has a binary op.
@@ -2814,36 +2905,120 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
  return false;
 }

-bool
+static CmpInst::Predicate getPredicateSense(CmpInst::Predicate P,
+                                            bool ShouldRevert) {
+  if (!ShouldRevert) return P;
+
+  switch(P) {
+  default:
+    llvm_unreachable("Unknown predicate sense");
+  case CmpInst::ICMP_UGT:
+  case CmpInst::ICMP_UGE:
+    return CmpInst::ICMP_ULT;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::ICMP_SGE:
+    return CmpInst::ICMP_SLT;
+  case CmpInst::ICMP_ULT:
+  case CmpInst::ICMP_ULE:
+    return CmpInst::ICMP_UGT;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::ICMP_SLE:
+    return CmpInst::ICMP_SGT;
+  }
+}
+
+/// Returns a ReductionInstDesc marking whether 'I' is part of a
+/// Select(ICmp(X, Y), X, Y) pattern corresponding to a min(X, Y) or max(X, Y).
+static LoopVectorizationLegality::ReductionInstDesc
+isMinMaxSelectCmpPattern(Instruction *I) {
+
+  assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
+         "Expect a select instruction");
+  ICmpInst *Cmp = 0;
+  SelectInst *Select = 0;
+
+  // Look for a select(icmp(),...) pattern. Only handle integer reductions for
+  // now.
+  if ((Select = dyn_cast<SelectInst>(I))) {
+    if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Only handle the single user case
+    if (!Cmp->hasOneUse())
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+  } else if ((Cmp = dyn_cast<ICmpInst>(I))) {
+    // Only handle the single user case.
+    if (!Cmp->hasOneUse())
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Look for the select.
+    if (!(Select = dyn_cast<SelectInst>(*I->use_begin())))
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Compare must be the first operand of the select.
+    if (Select->getOperand(0) != Cmp)
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+  }
+
+  CmpInst::Predicate Pred = Cmp->getPredicate();
+
+  // Only (u/s)lt/gt/ge/le are min or max patterns.
+  if (Pred == CmpInst::ICMP_EQ ||
+      Pred == CmpInst::ICMP_NE)
+    return LoopVectorizationLegality::ReductionInstDesc(false, I);
+
+  Value *SelectOp1 = Select->getOperand(1);
+  Value *SelectOp2 = Select->getOperand(2);
+
+  Value *CmpLeft = Cmp->getOperand(0);
+  Value *CmpRight = Cmp->getOperand(1);
+
+  // Can have reversed sense.
+  // select(slt(X, Y), Y, X) == select(sge(X, Y), X, Y).
+  bool IsInverted = (SelectOp2 == CmpLeft && SelectOp1 == CmpRight);
+  bool IsMinMaxPattern = (SelectOp1 == CmpLeft && SelectOp2 == CmpRight) ||
+    IsInverted;
+
+  // Advance the instruction pointer from the icmp to the select instruction.
+  if (IsMinMaxPattern) {
+    CmpInst::Predicate P = getPredicateSense(Pred, IsInverted);
+    return LoopVectorizationLegality::ReductionInstDesc(Select, P);
+  }
+
+  return LoopVectorizationLegality::ReductionInstDesc(false, I);
+}
+
+LoopVectorizationLegality::ReductionInstDesc
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
-                                            ReductionKind Kind) {
+                                            ReductionKind Kind,
+                                            ReductionInstDesc Desc) {
  bool FP = I->getType()->isFloatingPointTy();
  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
-
  switch (I->getOpcode()) {
  default:
-    return false;
+    return ReductionInstDesc(false, I);
  case Instruction::PHI:
      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
-        return false;
-    // possibly.
-    return true;
+        return ReductionInstDesc(false, I);
+    return ReductionInstDesc(I, Desc.Predicate);
  case Instruction::Sub:
  case Instruction::Add:
-    return Kind == RK_IntegerAdd;
+    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
  case Instruction::Mul:
-    return Kind == RK_IntegerMult;
+    return ReductionInstDesc(Kind == RK_IntegerMult, I);
  case Instruction::And:
-    return Kind == RK_IntegerAnd;
+    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
  case Instruction::Or:
-    return Kind == RK_IntegerOr;
+    return ReductionInstDesc(Kind == RK_IntegerOr, I);
  case Instruction::Xor:
-    return Kind == RK_IntegerXor;
+    return ReductionInstDesc(Kind == RK_IntegerXor, I);
  case Instruction::FMul:
-    return Kind == RK_FloatMult && FastMath;
+    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
  case Instruction::FAdd:
-    return Kind == RK_FloatAdd && FastMath;
-   }
+    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+  case Instruction::ICmp:
+  case Instruction::Select:
+    if (Kind != RK_IntegerMinMax)
+      return ReductionInstDesc(false, I);
+    return isMinMaxSelectCmpPattern(I);
+  }
 }

 LoopVectorizationLegality::InductionKind
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -0,0 +1,399 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck %s
+@A = common global [1024 x i32] zeroinitializer, align 16
+
+; Signed tests.
+
+; Turn this into a max reduction.
+; CHECK: @max_red
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed therefore
+; this is a max reduction.
+; CHECK: @max_red_inverse_select
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK: @min_red
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed therefore
+; this is a min reduction.
+; CHECK: @min_red_inverse_select
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Unsigned tests.
+
+; Turn this into a max reduction.
+; CHECK: @umax_red
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed therefore
+; this is a max reduction.
+; CHECK: @umax_red_inverse_select
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK: @umin_red
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed therefore
+; this is a min reduction.
+; CHECK: @umin_red_inverse_select
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SGE -> SLT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK: @sge_min_red
+; CHECK: icmp sge <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SLE -> SGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK: @sle_min_red
+; CHECK: icmp sle <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sle_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sle i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; UGE -> ULT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK: @uge_min_red
+; CHECK: icmp uge <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @uge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp uge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; ULE -> UGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK: @ule_min_red
+; CHECK: icmp ule <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @ule_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ule i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; No reduction.
+; CHECK: @no_red_1
+; CHECK-NOT: icmp {{[a-z]+}} <2 x i32>
+define i32 @no_red_1(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %1 = load i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %1
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; CHECK: @no_red_2
+; CHECK-NOT: icmp {{[a-z]+}} <2 x i32>
+define i32 @no_red_2(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %1 = load i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}