mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-28 04:33:05 +00:00
[OPENMP][LV][D3423] Respect Hints.Force meta-data for loops in LoopVectorizer
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207512 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
eda7f44b27
commit
c5e41aed09
@ -819,7 +819,8 @@ public:
|
||||
/// then this vectorization factor will be selected if vectorization is
|
||||
/// possible.
|
||||
VectorizationFactor selectVectorizationFactor(bool OptForSize,
|
||||
unsigned UserVF);
|
||||
unsigned UserVF,
|
||||
bool ForceVectorization);
|
||||
|
||||
/// \return The size (in bits) of the widest type in the code that
|
||||
/// needs to be vectorized. We ignore values that remain scalar such as
|
||||
@ -891,13 +892,17 @@ struct LoopVectorizeHints {
|
||||
unsigned Width;
|
||||
/// Vectorization unroll factor.
|
||||
unsigned Unroll;
|
||||
/// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled)
|
||||
int Force;
|
||||
/// Vectorization forced
|
||||
enum ForceKind {
|
||||
FK_Undefined = -1, ///< Not selected.
|
||||
FK_Disabled = 0, ///< Forcing disabled.
|
||||
FK_Enabled = 1, ///< Forcing enabled.
|
||||
} Force;
|
||||
|
||||
LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
|
||||
: Width(VectorizationFactor)
|
||||
, Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
|
||||
, Force(-1)
|
||||
, Force(FK_Undefined)
|
||||
, LoopID(L->getLoopID()) {
|
||||
getHints(L);
|
||||
// The command line options override any loop metadata except for when
|
||||
@ -1010,7 +1015,8 @@ private:
|
||||
DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
|
||||
} else if (Hint == "enable") {
|
||||
if (C->getBitWidth() == 1)
|
||||
Force = Val;
|
||||
Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
|
||||
: LoopVectorizeHints::FK_Disabled;
|
||||
else
|
||||
DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
|
||||
} else {
|
||||
@ -1106,18 +1112,20 @@ struct LoopVectorize : public FunctionPass {
|
||||
LoopVectorizeHints Hints(L, DisableUnrolling);
|
||||
|
||||
DEBUG(dbgs() << "LV: Loop hints:"
|
||||
<< " force=" << (Hints.Force == 0
|
||||
? "disabled"
|
||||
: (Hints.Force == 1 ? "enabled" : "?"))
|
||||
<< " width=" << Hints.Width << " unroll=" << Hints.Unroll
|
||||
<< "\n");
|
||||
<< " force="
|
||||
<< (Hints.Force == LoopVectorizeHints::FK_Disabled
|
||||
? "disabled"
|
||||
: (Hints.Force == LoopVectorizeHints::FK_Enabled
|
||||
? "enabled"
|
||||
: "?")) << " width=" << Hints.Width
|
||||
<< " unroll=" << Hints.Unroll << "\n");
|
||||
|
||||
if (Hints.Force == 0) {
|
||||
if (Hints.Force == LoopVectorizeHints::FK_Disabled) {
|
||||
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!AlwaysVectorize && Hints.Force != 1) {
|
||||
if (!AlwaysVectorize && Hints.Force != LoopVectorizeHints::FK_Enabled) {
|
||||
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
|
||||
return false;
|
||||
}
|
||||
@ -1127,6 +1135,21 @@ struct LoopVectorize : public FunctionPass {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check the loop for a trip count threshold:
|
||||
// do not vectorize loops with a tiny trip count.
|
||||
BasicBlock *Latch = L->getLoopLatch();
|
||||
const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
|
||||
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
|
||||
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
|
||||
<< "This loop is not worth vectorizing.");
|
||||
if (Hints.Force == LoopVectorizeHints::FK_Enabled)
|
||||
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
|
||||
else {
|
||||
DEBUG(dbgs() << "\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it is legal to vectorize the loop.
|
||||
LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
|
||||
if (!LVL.canVectorize()) {
|
||||
@ -1140,8 +1163,8 @@ struct LoopVectorize : public FunctionPass {
|
||||
// Check the function attributes to find out if this function should be
|
||||
// optimized for size.
|
||||
Function *F = L->getHeader()->getParent();
|
||||
bool OptForSize =
|
||||
Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
|
||||
bool OptForSize = Hints.Force != LoopVectorizeHints::FK_Enabled &&
|
||||
F->hasFnAttribute(Attribute::OptimizeForSize);
|
||||
|
||||
// Compute the weighted frequency of this loop being executed and see if it
|
||||
// is less than 20% of the function entry baseline frequency. Note that we
|
||||
@ -1150,7 +1173,8 @@ struct LoopVectorize : public FunctionPass {
|
||||
// exactly what block frequency models.
|
||||
if (LoopVectorizeWithBlockFrequency) {
|
||||
BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
|
||||
if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
|
||||
if (Hints.Force != LoopVectorizeHints::FK_Enabled &&
|
||||
LoopEntryFreq < ColdEntryFreq)
|
||||
OptForSize = true;
|
||||
}
|
||||
|
||||
@ -1166,7 +1190,10 @@ struct LoopVectorize : public FunctionPass {
|
||||
|
||||
// Select the optimal vectorization factor.
|
||||
const LoopVectorizationCostModel::VectorizationFactor VF =
|
||||
CM.selectVectorizationFactor(OptForSize, Hints.Width);
|
||||
CM.selectVectorizationFactor(OptForSize, Hints.Width,
|
||||
Hints.Force ==
|
||||
LoopVectorizeHints::FK_Enabled);
|
||||
|
||||
// Select the unroll factor.
|
||||
const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
|
||||
VF.Cost);
|
||||
@ -3300,15 +3327,6 @@ bool LoopVectorizationLegality::canVectorize() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Do not loop-vectorize loops with a tiny trip count.
|
||||
BasicBlock *Latch = TheLoop->getLoopLatch();
|
||||
unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
|
||||
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
|
||||
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
|
||||
"This loop is not worth vectorizing.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if we can vectorize the instructions and CFG in this loop.
|
||||
if (!canVectorizeInstrs()) {
|
||||
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
|
||||
@ -5007,7 +5025,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
|
||||
|
||||
LoopVectorizationCostModel::VectorizationFactor
|
||||
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
||||
unsigned UserVF) {
|
||||
unsigned UserVF,
|
||||
bool ForceVectorization) {
|
||||
// Width 1 means no vectorize
|
||||
VectorizationFactor Factor = { 1U, 0U };
|
||||
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
|
||||
@ -5077,8 +5096,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
||||
}
|
||||
|
||||
float Cost = expectedCost(1);
|
||||
const float ScalarCost = Cost;
|
||||
unsigned Width = 1;
|
||||
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
|
||||
|
||||
// Ignore scalar width, because the user explicitly wants vectorization.
|
||||
if (ForceVectorization && VF > 1) {
|
||||
Width = 2;
|
||||
Cost = expectedCost(Width) / (float)Width;
|
||||
}
|
||||
|
||||
for (unsigned i=2; i <= VF; i*=2) {
|
||||
// Notice that the vector loop needs to be executed less times, so
|
||||
// we need to divide the cost of the vector loops by the width of
|
||||
@ -5092,6 +5119,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
|
||||
<< "LV: Vectorization seems to be not beneficial, "
|
||||
<< "but was forced by a user.\n");
|
||||
DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
|
||||
Factor.Width = Width;
|
||||
Factor.Cost = Width * Cost;
|
||||
|
93
test/Transforms/LoopVectorize/X86/vect.omp.force.ll
Normal file
93
test/Transforms/LoopVectorize/X86/vect.omp.force.ll
Normal file
@ -0,0 +1,93 @@
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
; CHECK: LV: Loop hints: force=enabled
|
||||
; CHECK: LV: Loop hints: force=?
|
||||
; No more loops in the module
|
||||
; CHECK-NOT: LV: Loop hints: force=
|
||||
; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
|
||||
; CHECK: 1 loop-vectorize - Number of loops vectorized
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
;
|
||||
; The source code for the test:
|
||||
;
|
||||
; #include <math.h>
|
||||
; void foo(float* restrict A, float * restrict B, int size)
|
||||
; {
|
||||
; for (int i = 0; i < size; ++i) A[i] = sinf(B[i]);
|
||||
; }
|
||||
;
|
||||
|
||||
;
|
||||
; This loop will be vectorized even though the scalar cost is lower than any of the vector costs, because vectorization is explicitly forced in metadata.
|
||||
;
|
||||
|
||||
define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) {
|
||||
entry:
|
||||
%cmp6 = icmp sgt i32 %size, 0
|
||||
br i1 %cmp6, label %for.body.preheader, label %for.end
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
|
||||
%0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
|
||||
%call = tail call float @llvm.sin.f32(float %0)
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
|
||||
store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, %size
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1
|
||||
|
||||
for.end.loopexit:
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
!1 = metadata !{metadata !1, metadata !2}
|
||||
!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
|
||||
|
||||
;
|
||||
; This loop will not be vectorized, as the scalar cost is lower than any of the vector costs.
|
||||
;
|
||||
|
||||
define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) {
|
||||
entry:
|
||||
%cmp6 = icmp sgt i32 %size, 0
|
||||
br i1 %cmp6, label %for.body.preheader, label %for.end
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
|
||||
%0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
|
||||
%call = tail call float @llvm.sin.f32(float %0)
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
|
||||
store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, %size
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3
|
||||
|
||||
for.end.loopexit:
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.sin.f32(float) nounwind readnone
|
||||
|
||||
; Dummy metadata
|
||||
!3 = metadata !{metadata !3}
|
||||
|
73
test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
Normal file
73
test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
Normal file
@ -0,0 +1,73 @@
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
; CHECK: LV: Loop hints: force=enabled
|
||||
; CHECK: LV: Loop hints: force=?
|
||||
; No more loops in the module
|
||||
; CHECK-NOT: LV: Loop hints: force=
|
||||
; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
|
||||
; CHECK: 1 loop-vectorize - Number of loops vectorized
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
;
|
||||
; The source code for the test:
|
||||
;
|
||||
; void foo(float* restrict A, float* restrict B)
|
||||
; {
|
||||
; for (int i = 0; i < 20; ++i) A[i] += B[i];
|
||||
; }
|
||||
;
|
||||
|
||||
;
|
||||
; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced in metadata.
|
||||
;
|
||||
define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
|
||||
%0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
|
||||
%1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
|
||||
%add = fadd fast float %0, %1
|
||||
store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 20
|
||||
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
!1 = metadata !{metadata !1, metadata !2}
|
||||
!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
|
||||
|
||||
;
|
||||
; This loop will not be vectorized as the trip count is below the threshold.
|
||||
;
|
||||
define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
|
||||
%0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
|
||||
%1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
|
||||
%add = fadd fast float %0, %1
|
||||
store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 20
|
||||
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
!3 = metadata !{metadata !3}
|
||||
|
Loading…
Reference in New Issue
Block a user