diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index 27887749e96..42b6b27d0ce 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -111,7 +111,6 @@ public: bool BBVectorize; bool SLPVectorize; bool LoopVectorize; - bool LateVectorize; bool RerollLoops; private: diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h index 823c5fba745..e93b39a8140 100644 --- a/include/llvm/Transforms/Vectorize.h +++ b/include/llvm/Transforms/Vectorize.h @@ -114,7 +114,8 @@ createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig()); // // LoopVectorize - Create a loop vectorization pass. // -Pass *createLoopVectorizePass(bool NoUnrolling = false); +Pass *createLoopVectorizePass(bool NoUnrolling = false, + bool AlwaysVectorize = true); //===----------------------------------------------------------------------===// // diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 24c5018d542..cd46c799829 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -32,11 +32,6 @@ static cl::opt RunLoopVectorization("vectorize-loops", cl::Hidden, cl::desc("Run the Loop vectorization passes")); -static cl::opt -LateVectorization("late-vectorize", cl::init(true), cl::Hidden, - cl::desc("Run the vectorization pasess late in the pass " - "pipeline (after the inliner)")); - static cl::opt RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); @@ -68,7 +63,6 @@ PassManagerBuilder::PassManagerBuilder() { BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; - LateVectorize = LateVectorization; RerollLoops = RunLoopRerolling; } @@ -200,9 +194,6 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { 
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - if (!LateVectorize && LoopVectorize) - MPM.add(createLoopVectorizePass(DisableUnrollLoops)); - if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); @@ -243,21 +234,18 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. - // As an experimental mode, run any vectorization passes in a separate - // pipeline from the CGSCC pass manager that runs iteratively with the - // inliner. - if (LateVectorize && LoopVectorize) { - // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC - // pass manager that we are specifically trying to avoid. To prevent this - // we must insert a no-op module pass to reset the pass manager. - MPM.add(createBarrierNoopPass()); - - // Add the various vectorization passes and relevant cleanup passes for - // them since we are no longer in the middle of the main scalar pipeline. - MPM.add(createLoopVectorizePass(DisableUnrollLoops)); - MPM.add(createInstructionCombiningPass()); - MPM.add(createCFGSimplificationPass()); - } + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager that we are specifically trying to avoid. To prevent this + // we must insert a no-op module pass to reset the pass manager. + MPM.add(createBarrierNoopPass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + // FIXME: Because of #pragma vectorize enable, the passes below are always + // inserted in the pipeline, even when the vectorizer doesn't run (e.g. when + // on -O1 and no #pragma is found). Would be good to have these two passes + // as function calls, so that we can only run them when the vectorizer + // changed the code.
+ MPM.add(createInstructionCombiningPass()); + MPM.add(createCFGSimplificationPass()); if (!DisableUnitAtATime) { // FIXME: We shouldn't bother with this anymore. diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c3d29f6597..45ddeaf9336 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -763,10 +763,13 @@ struct LoopVectorizeHints { unsigned Width; /// Vectorization unroll factor. unsigned Unroll; + /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled) + int Force; LoopVectorizeHints(const Loop *L, bool DisableUnrolling) : Width(VectorizationFactor) , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) + , Force(-1) , LoopID(L->getLoopID()) { getHints(L); // The command line options override any loop metadata except for when @@ -877,6 +880,11 @@ private: Unroll = Val; else DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); + } else if (Hint == "enable") { + if (C->getBitWidth() == 1) + Force = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); } else { DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); } @@ -888,8 +896,10 @@ struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid static char ID; - explicit LoopVectorize(bool NoUnrolling = false) - : LoopPass(ID), DisableUnrolling(NoUnrolling) { + explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) + : LoopPass(ID), + DisableUnrolling(NoUnrolling), + AlwaysVectorize(AlwaysVectorize) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -900,6 +910,7 @@ struct LoopVectorize : public LoopPass { DominatorTree *DT; TargetLibraryInfo *TLI; bool DisableUnrolling; + bool AlwaysVectorize; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. 
@@ -919,7 +930,7 @@ struct LoopVectorize : public LoopPass { return false; if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n"); + DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n"); return false; } @@ -928,15 +939,25 @@ struct LoopVectorize : public LoopPass { LoopVectorizeHints Hints(L, DisableUnrolling); + if (Hints.Force == 0) { + DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + return false; + } + + if (!AlwaysVectorize && Hints.Force != 1) { + DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + return false; + } + if (Hints.Width == 1 && Hints.Unroll == 1) { - DEBUG(dbgs() << "LV: Not vectorizing.\n"); + DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); return false; } // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing.\n"); + DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); return false; } @@ -949,7 +970,8 @@ struct LoopVectorize : public LoopPass { Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat; unsigned FnIndex = AttributeSet::FunctionIndex; - bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr); + bool OptForSize = Hints.Force != 1 && + F->getAttributes().hasAttribute(FnIndex, SzAttr); bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr); if (NoFloat) { @@ -973,6 +995,7 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); if (UF == 1) return false; + DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); // We decided not to vectorize, but we may want to unroll. 
InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); @@ -5016,8 +5039,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass(bool NoUnrolling) { - return new LoopVectorize(NoUnrolling); + Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { + return new LoopVectorize(NoUnrolling, AlwaysVectorize); } } diff --git a/test/Transforms/LoopVectorize/metadata-enable.ll b/test/Transforms/LoopVectorize/metadata-enable.ll new file mode 100644 index 00000000000..fff3c0e808f --- /dev/null +++ b/test/Transforms/LoopVectorize/metadata-enable.ll @@ -0,0 +1,175 @@ +; RUN: opt < %s -O1 -S | FileCheck %s --check-prefix=O1 +; RUN: opt < %s -O2 -S | FileCheck %s --check-prefix=O2 +; RUN: opt < %s -O3 -S | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -Os -S | FileCheck %s --check-prefix=Os +; RUN: opt < %s -Oz -S | FileCheck %s --check-prefix=Oz +; RUN: opt < %s -O1 -vectorize-loops -S | FileCheck %s --check-prefix=O1VEC +; RUN: opt < %s -Oz -vectorize-loops -S | FileCheck %s --check-prefix=OzVEC +; RUN: opt < %s -O1 -loop-vectorize -S | FileCheck %s --check-prefix=O1VEC2 +; RUN: opt < %s -Oz -loop-vectorize -S | FileCheck %s --check-prefix=OzVEC2 +; RUN: opt < %s -O3 -disable-loop-vectorization -S | FileCheck %s --check-prefix=O3DIS + +; This file tests the llvm.vectorizer.enable metadata forcing vectorization +; even when optimization levels are too low, or when vectorization is disabled. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; O1-LABEL: @enabled( +; O1: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @enabled( +; O2: store <4 x i32> +; O2: ret i32 +; O3-LABEL: @enabled( +; O3: store <4 x i32> +; O3: ret i32 +; Pragma always wins!
+; O3DIS-LABEL: @enabled( +; O3DIS: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @enabled( +; Os: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @enabled( +; Oz: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @enabled( +; O1VEC: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @enabled( +; OzVEC: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @enabled( +; O1VEC2: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @enabled( +; OzVEC2: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + +; O1-LABEL: @nopragma( +; O1-NOT: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @nopragma( +; O2: store <4 x i32> +; O2: ret i32 +; O3-LABEL: @nopragma( +; O3: store <4 x i32> +; O3: ret i32 +; O3DIS-LABEL: @nopragma( +; O3DIS-NOT: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @nopragma( +; Os: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @nopragma( +; Oz-NOT: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @nopragma( +; O1VEC: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @nopragma( +; OzVEC: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @nopragma( +; O1VEC2: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @nopragma( +; OzVEC2: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body 
+ +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + +; O1-LABEL: @disabled( +; O1-NOT: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @disabled( +; O2-NOT: store <4 x i32> +; O2: ret i32 +; O3-LABEL: @disabled( +; O3-NOT: store <4 x i32> +; O3: ret i32 +; O3DIS-LABEL: @disabled( +; O3DIS-NOT: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @disabled( +; Os-NOT: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @disabled( +; Oz-NOT: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @disabled( +; O1VEC-NOT: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @disabled( +; OzVEC-NOT: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @disabled( +; O1VEC2-NOT: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @disabled( +; OzVEC2-NOT: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + 
+!0 = metadata !{metadata !0, metadata !1} +!1 = metadata !{metadata !"llvm.vectorizer.enable", i1 1} +!2 = metadata !{metadata !2, metadata !3} +!3 = metadata !{metadata !"llvm.vectorizer.enable", i1 0} \ No newline at end of file diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index dba16f72dab..5e27c09cdd6 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -471,8 +471,14 @@ static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM, Builder.DisableUnrollLoops = (DisableLoopUnrolling.getNumOccurrences() > 0) ? DisableLoopUnrolling : OptLevel == 0; - Builder.LoopVectorize = - DisableLoopVectorization ? false : OptLevel > 1 && SizeLevel < 2; + // This is final, unless there is a #pragma vectorize enable + if (DisableLoopVectorization) + Builder.LoopVectorize = false; + // If option wasn't forced via cmd line (-vectorize-loops, -loop-vectorize) + else if (!Builder.LoopVectorize) + Builder.LoopVectorize = OptLevel > 1 && SizeLevel < 2; + + // When #pragma vectorize is on for SLP, do the same as above Builder.SLPVectorize = DisableSLPVectorization ? false : OptLevel > 1 && SizeLevel < 2;