mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-13 20:32:21 +00:00
[SLPVectorizer] Try different vectorization factors for store chains
...and set max vector register size based on target.

This patch is based on discussion on the llvmdev mailing list:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/087405.html
and also solves:
https://llvm.org/bugs/show_bug.cgi?id=17170

Several FIXME/TODO items are noted in comments as potential improvements.

Differential Revision: http://reviews.llvm.org/D10950
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@241760 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
6e999b0a30
commit
c1c43c15cc
@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
|
|||||||
cl::desc(
|
cl::desc(
|
||||||
"Attempt to vectorize horizontal reductions feeding into a store"));
|
"Attempt to vectorize horizontal reductions feeding into a store"));
|
||||||
|
|
||||||
|
static cl::opt<int>
|
||||||
|
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
|
||||||
|
cl::desc("Attempt to vectorize for this register size in bits"));
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// FIXME: Set this via cl::opt to allow overriding.
|
||||||
static const unsigned MinVecRegSize = 128;
|
static const unsigned MinVecRegSize = 128;
|
||||||
|
|
||||||
static const unsigned RecursionMaxDepth = 12;
|
static const unsigned RecursionMaxDepth = 12;
|
||||||
@ -3088,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
|
|||||||
if (!TTI->getNumberOfRegisters(true))
|
if (!TTI->getNumberOfRegisters(true))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// Use the vector register size specified by the target unless overridden
|
||||||
|
// by a command-line option.
|
||||||
|
// TODO: It would be better to limit the vectorization factor based on
|
||||||
|
// data type rather than just register size. For example, x86 AVX has
|
||||||
|
// 256-bit registers, but it does not support integer operations
|
||||||
|
// at that width (that requires AVX2).
|
||||||
|
if (MaxVectorRegSizeOption.getNumOccurrences())
|
||||||
|
MaxVecRegSize = MaxVectorRegSizeOption;
|
||||||
|
else
|
||||||
|
MaxVecRegSize = TTI->getRegisterBitWidth(true);
|
||||||
|
|
||||||
// Don't vectorize when the attribute NoImplicitFloat is used.
|
// Don't vectorize when the attribute NoImplicitFloat is used.
|
||||||
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
|
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
|
||||||
return false;
|
return false;
|
||||||
@ -3165,12 +3181,13 @@ private:
|
|||||||
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
|
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
|
||||||
|
|
||||||
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
|
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
|
||||||
BoUpSLP &R);
|
BoUpSLP &R, unsigned VecRegSize);
|
||||||
|
|
||||||
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
|
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
|
||||||
BoUpSLP &R);
|
BoUpSLP &R);
|
||||||
private:
|
private:
|
||||||
StoreListMap StoreRefs;
|
StoreListMap StoreRefs;
|
||||||
|
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
|
||||||
};
|
};
|
||||||
|
|
||||||
/// \brief Check that the Values in the slice in VL array are still existent in
|
/// \brief Check that the Values in the slice in VL array are still existent in
|
||||||
@ -3185,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
|
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
|
||||||
int CostThreshold, BoUpSLP &R) {
|
int CostThreshold, BoUpSLP &R,
|
||||||
|
unsigned VecRegSize) {
|
||||||
unsigned ChainLen = Chain.size();
|
unsigned ChainLen = Chain.size();
|
||||||
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
|
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
|
||||||
<< "\n");
|
<< "\n");
|
||||||
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
|
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
|
||||||
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
|
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
|
||||||
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
|
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
|
||||||
unsigned VF = MinVecRegSize / Sz;
|
unsigned VF = VecRegSize / Sz;
|
||||||
|
|
||||||
if (!isPowerOf2_32(Sz) || VF < 2)
|
if (!isPowerOf2_32(Sz) || VF < 2)
|
||||||
return false;
|
return false;
|
||||||
@ -3276,10 +3294,15 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
|||||||
I = ConsecutiveChain[I];
|
I = ConsecutiveChain[I];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vectorizeStoreChain(Operands, costThreshold, R)) {
|
// FIXME: Is division-by-2 the correct step? Should we assert that the
|
||||||
// Mark the vectorized stores so that we don't vectorize them again.
|
// register size is a power-of-2?
|
||||||
VectorizedStores.insert(Operands.begin(), Operands.end());
|
for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
|
||||||
Changed = true;
|
if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
|
||||||
|
// Mark the vectorized stores so that we don't vectorize them again.
|
||||||
|
VectorizedStores.insert(Operands.begin(), Operands.end());
|
||||||
|
Changed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3340,6 +3363,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
|||||||
|
|
||||||
Type *Ty0 = I0->getType();
|
Type *Ty0 = I0->getType();
|
||||||
unsigned Sz = DL.getTypeSizeInBits(Ty0);
|
unsigned Sz = DL.getTypeSizeInBits(Ty0);
|
||||||
|
// FIXME: Register size should be a parameter to this function, so we can
|
||||||
|
// try different vectorization factors.
|
||||||
unsigned VF = MinVecRegSize / Sz;
|
unsigned VF = MinVecRegSize / Sz;
|
||||||
|
|
||||||
for (Value *V : VL) {
|
for (Value *V : VL) {
|
||||||
@ -3569,6 +3594,8 @@ public:
|
|||||||
const DataLayout &DL = B->getModule()->getDataLayout();
|
const DataLayout &DL = B->getModule()->getDataLayout();
|
||||||
ReductionOpcode = B->getOpcode();
|
ReductionOpcode = B->getOpcode();
|
||||||
ReducedValueOpcode = 0;
|
ReducedValueOpcode = 0;
|
||||||
|
// FIXME: Register size should be a parameter to this function, so we can
|
||||||
|
// try different vectorization factors.
|
||||||
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
|
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
|
||||||
ReductionRoot = B;
|
ReductionRoot = B;
|
||||||
ReductionPHI = Phi;
|
ReductionPHI = Phi;
|
||||||
@ -3995,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
|
|||||||
<< it->second.size() << ".\n");
|
<< it->second.size() << ".\n");
|
||||||
|
|
||||||
// Process the stores in chunks of 16.
|
// Process the stores in chunks of 16.
|
||||||
|
// TODO: The limit of 16 inhibits greater vectorization factors.
|
||||||
|
// For example, AVX2 supports v32i8. Increasing this limit, however,
|
||||||
|
// may cause a significant compile-time increase.
|
||||||
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
|
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
|
||||||
unsigned Len = std::min<unsigned>(CE - CI, 16);
|
unsigned Len = std::min<unsigned>(CE - CI, 16);
|
||||||
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
|
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
|
||||||
|
@ -1,4 +1,9 @@
|
|||||||
; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
|
; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
|
||||||
|
; XFAIL: *
|
||||||
|
;
|
||||||
|
; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
|
||||||
|
; has vector registers of the expected width.
|
||||||
|
; Currently, it says there are 8 vector registers that are 32-bits wide.
|
||||||
|
|
||||||
target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
|
target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
|
||||||
|
|
||||||
|
@ -12,11 +12,8 @@ target triple = "i386-apple-macosx10.8.0"
|
|||||||
|
|
||||||
;CHECK-LABEL: @test(
|
;CHECK-LABEL: @test(
|
||||||
;CHECK: load <2 x double>
|
;CHECK: load <2 x double>
|
||||||
;CHECK: fadd <2 x double>
|
;CHECK: fadd <4 x double>
|
||||||
;CHECK: store <2 x double>
|
;CHECK: store <4 x double>
|
||||||
;CHECK: insertelement <2 x double>
|
|
||||||
;CHECK: fadd <2 x double>
|
|
||||||
;CHECK: store <2 x double>
|
|
||||||
;CHECK: ret i32
|
;CHECK: ret i32
|
||||||
|
|
||||||
define i32 @test(double* nocapture %G) {
|
define i32 @test(double* nocapture %G) {
|
||||||
@ -48,11 +45,12 @@ entry:
|
|||||||
; A[2] = A[2] * 7.6 * n + 3.0;
|
; A[2] = A[2] * 7.6 * n + 3.0;
|
||||||
; A[3] = A[3] * 7.4 * n + 4.0;
|
; A[3] = A[3] * 7.4 * n + 4.0;
|
||||||
;}
|
;}
|
||||||
;CHECK-LABEL: @foo(
|
; CHECK-LABEL: @foo(
|
||||||
;CHECK: insertelement <2 x double>
|
; CHECK: load <4 x double>
|
||||||
;CHECK: insertelement <2 x double>
|
; CHECK: fmul <4 x double>
|
||||||
;CHECK-NOT: insertelement <2 x double>
|
; CHECK: fmul <4 x double>
|
||||||
;CHECK: ret
|
; CHECK: fadd <4 x double>
|
||||||
|
; CHECK: store <4 x double>
|
||||||
define i32 @foo(double* nocapture %A, i32 %n) {
|
define i32 @foo(double* nocapture %A, i32 %n) {
|
||||||
entry:
|
entry:
|
||||||
%0 = load double, double* %A, align 8
|
%0 = load double, double* %A, align 8
|
||||||
@ -140,11 +138,12 @@ define i32 @test2(double* nocapture %G, i32 %k) {
|
|||||||
; A[2] = A[2] * 7.9 * n + 6.0;
|
; A[2] = A[2] * 7.9 * n + 6.0;
|
||||||
; A[3] = A[3] * 7.9 * n + 6.0;
|
; A[3] = A[3] * 7.9 * n + 6.0;
|
||||||
;}
|
;}
|
||||||
;CHECK-LABEL: @foo4(
|
; CHECK-LABEL: @foo4(
|
||||||
;CHECK: insertelement <2 x double>
|
; CHECK: load <4 x double>
|
||||||
;CHECK: insertelement <2 x double>
|
; CHECK: fmul <4 x double>
|
||||||
;CHECK-NOT: insertelement <2 x double>
|
; CHECK: fmul <4 x double>
|
||||||
;CHECK: ret
|
; CHECK: fadd <4 x double>
|
||||||
|
; CHECK: store <4 x double>
|
||||||
define i32 @foo4(double* nocapture %A, i32 %n) {
|
define i32 @foo4(double* nocapture %A, i32 %n) {
|
||||||
entry:
|
entry:
|
||||||
%0 = load double, double* %A, align 8
|
%0 = load double, double* %A, align 8
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s
|
; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s
|
||||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
|
target triple = "x86_64-unknown-unknown"
|
||||||
|
|
||||||
; Test if SLP can handle GEP expressions.
|
; Test if SLP can handle GEP expressions.
|
||||||
; The test performs the following action:
|
; The test performs the following action:
|
||||||
|
@ -4,12 +4,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
|||||||
target triple = "x86_64-apple-macosx10.8.0"
|
target triple = "x86_64-apple-macosx10.8.0"
|
||||||
|
|
||||||
;CHECK-LABEL: @foo(
|
;CHECK-LABEL: @foo(
|
||||||
;CHECK: load <4 x i32>
|
;CHECK: load <8 x i32>
|
||||||
;CHECK: add nsw <4 x i32>
|
;CHECK: add nsw <8 x i32>
|
||||||
;CHECK: store <4 x i32>
|
;CHECK: store <8 x i32>
|
||||||
;CHECK: load <4 x i32>
|
|
||||||
;CHECK: add nsw <4 x i32>
|
|
||||||
;CHECK: store <4 x i32>
|
|
||||||
;CHECK: ret
|
;CHECK: ret
|
||||||
define i32 @foo(i32* nocapture %A, i32 %n) {
|
define i32 @foo(i32* nocapture %A, i32 %n) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -1,11 +1,24 @@
|
|||||||
; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
|
; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
|
||||||
|
; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
|
||||||
|
|
||||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
target triple = "x86_64-unknown-linux-gnu"
|
target triple = "x86_64-unknown-linux-gnu"
|
||||||
|
|
||||||
; CHECK: load <2 x double>, <2 x double>*
|
; CHECK-LABEL: @foo(
|
||||||
; CHECK: fadd <2 x double>
|
; CHECK: load <4 x double>
|
||||||
; CHECK: store <2 x double>
|
; CHECK: fadd <4 x double>
|
||||||
|
; CHECK: fadd <4 x double>
|
||||||
|
; CHECK: store <4 x double>
|
||||||
|
|
||||||
|
; V128-LABEL: @foo(
|
||||||
|
; V128: load <2 x double>
|
||||||
|
; V128: fadd <2 x double>
|
||||||
|
; V128: fadd <2 x double>
|
||||||
|
; V128: store <2 x double>
|
||||||
|
; V128: load <2 x double>
|
||||||
|
; V128: fadd <2 x double>
|
||||||
|
; V128: fadd <2 x double>
|
||||||
|
; V128: store <2 x double>
|
||||||
|
|
||||||
define void @foo(double* %x) {
|
define void @foo(double* %x) {
|
||||||
%1 = load double, double* %x, align 8
|
%1 = load double, double* %x, align 8
|
||||||
|
Loading…
Reference in New Issue
Block a user