From c1c43c15ccd850278568a24cb80b40c06c4e1e41 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Wed, 8 Jul 2015 23:40:55 +0000
Subject: [PATCH] [SLPVectorizer] Try different vectorization factors for
 store chains

...and set max vector register size based on target

This patch is based on discussion on the llvmdev mailing list:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/087405.html

and also solves:
https://llvm.org/bugs/show_bug.cgi?id=17170

Several FIXME/TODO items are noted in comments as potential improvements.

Differential Revision: http://reviews.llvm.org/D10950

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@241760 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp   | 44 ++++++++++++++++---
 .../SLPVectorizer/AMDGPU/simplebb.ll         |  5 +++
 test/Transforms/SLPVectorizer/X86/cse.ll     | 29 ++++++------
 test/Transforms/SLPVectorizer/X86/gep.ll     |  1 +
 .../SLPVectorizer/X86/loopinvariant.ll       |  9 ++--
 test/Transforms/SLPVectorizer/X86/pr19657.ll | 19 ++++++--
 6 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index da35b35248f..7bac407e77e 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     cl::desc(
         "Attempt to vectorize horizontal reductions feeding into a store"));
 
+static cl::opt<unsigned>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+    cl::desc("Attempt to vectorize for this register size in bits"));
+
 namespace {
 
+// FIXME: Set this via cl::opt to allow overriding.
 static const unsigned MinVecRegSize = 128;
 
 static const unsigned RecursionMaxDepth = 12;
@@ -3088,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
     if (!TTI->getNumberOfRegisters(true))
       return false;
 
+    // Use the vector register size specified by the target unless overridden
+    // by a command-line option.
+    // TODO: It would be better to limit the vectorization factor based on
+    // data type rather than just register size. For example, x86 AVX has
+    // 256-bit registers, but it does not support integer operations
+    // at that width (that requires AVX2).
+    if (MaxVectorRegSizeOption.getNumOccurrences())
+      MaxVecRegSize = MaxVectorRegSizeOption;
+    else
+      MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
     // Don't vectorize when the attribute NoImplicitFloat is used.
     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
       return false;
@@ -3165,12 +3181,13 @@ private:
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
 
   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
-                           BoUpSLP &R);
+                           BoUpSLP &R, unsigned VecRegSize);
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                        BoUpSLP &R);
 private:
   StoreListMap StoreRefs;
+  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
 
 /// \brief Check that the Values in the slice in VL array are still existent in
@@ -3185,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
 }
 
 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
-                                        int CostThreshold, BoUpSLP &R) {
+                                        int CostThreshold, BoUpSLP &R,
+                                        unsigned VecRegSize) {
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
         << "\n");
   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
   auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
     return false;
@@ -3276,10 +3294,15 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
       I = ConsecutiveChain[I];
     }
 
-    if (vectorizeStoreChain(Operands, costThreshold, R)) {
-      // Mark the vectorized stores so that we don't vectorize them again.
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-      Changed = true;
+    // FIXME: Is division-by-2 the correct step? Should we assert that the
+    // register size is a power-of-2?
+    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
+        break;
+      }
     }
   }
 
@@ -3340,6 +3363,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   Type *Ty0 = I0->getType();
   unsigned Sz = DL.getTypeSizeInBits(Ty0);
 
+  // FIXME: Register size should be a parameter to this function, so we can
+  // try different vectorization factors.
   unsigned VF = MinVecRegSize / Sz;
 
   for (Value *V : VL) {
@@ -3569,6 +3594,8 @@ public:
     const DataLayout &DL = B->getModule()->getDataLayout();
     ReductionOpcode = B->getOpcode();
     ReducedValueOpcode = 0;
+    // FIXME: Register size should be a parameter to this function, so we can
+    // try different vectorization factors.
    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
     ReductionRoot = B;
     ReductionPHI = Phi;
@@ -3995,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
           << it->second.size() << ".\n");
 
     // Process the stores in chunks of 16.
+    // TODO: The limit of 16 inhibits greater vectorization factors.
+    // For example, AVX2 supports v32i8. Increasing this limit, however,
+    // may cause a significant compile-time increase.
     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
       unsigned Len = std::min<unsigned>(CE - CI, 16);
       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
diff --git a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
index 9ed86f88147..35763953911 100644
--- a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -1,4 +1,9 @@
 ; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+; XFAIL: *
+;
+; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
+; has vector registers of the expected width.
+; Currently, it says there are 8 vector registers that are 32-bits wide.
target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll index 00e1ecd4ad3..8d25b3661dc 100644 --- a/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/test/Transforms/SLPVectorizer/X86/cse.ll @@ -12,11 +12,8 @@ target triple = "i386-apple-macosx10.8.0" ;CHECK-LABEL: @test( ;CHECK: load <2 x double> -;CHECK: fadd <2 x double> -;CHECK: store <2 x double> -;CHECK: insertelement <2 x double> -;CHECK: fadd <2 x double> -;CHECK: store <2 x double> +;CHECK: fadd <4 x double> +;CHECK: store <4 x double> ;CHECK: ret i32 define i32 @test(double* nocapture %G) { @@ -48,11 +45,12 @@ entry: ; A[2] = A[2] * 7.6 * n + 3.0; ; A[3] = A[3] * 7.4 * n + 4.0; ;} -;CHECK-LABEL: @foo( -;CHECK: insertelement <2 x double> -;CHECK: insertelement <2 x double> -;CHECK-NOT: insertelement <2 x double> -;CHECK: ret +; CHECK-LABEL: @foo( +; CHECK: load <4 x double> +; CHECK: fmul <4 x double> +; CHECK: fmul <4 x double> +; CHECK: fadd <4 x double> +; CHECK: store <4 x double> define i32 @foo(double* nocapture %A, i32 %n) { entry: %0 = load double, double* %A, align 8 @@ -140,11 +138,12 @@ define i32 @test2(double* nocapture %G, i32 %k) { ; A[2] = A[2] * 7.9 * n + 6.0; ; A[3] = A[3] * 7.9 * n + 6.0; ;} -;CHECK-LABEL: @foo4( -;CHECK: insertelement <2 x double> -;CHECK: insertelement <2 x double> -;CHECK-NOT: insertelement <2 x double> -;CHECK: ret +; CHECK-LABEL: @foo4( +; CHECK: load <4 x double> +; CHECK: fmul <4 x double> +; CHECK: fmul <4 x double> +; CHECK: fadd <4 x double> +; CHECK: store <4 x double> define i32 @foo4(double* nocapture %A, i32 %n) { entry: %0 = load double, double* %A, align 8 diff --git a/test/Transforms/SLPVectorizer/X86/gep.ll b/test/Transforms/SLPVectorizer/X86/gep.ll index 3f952d7b242..d10f2b6015d 100644 --- a/test/Transforms/SLPVectorizer/X86/gep.ll +++ b/test/Transforms/SLPVectorizer/X86/gep.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" ; Test if SLP can handle GEP expressions. 
 ; The test perform the following action:
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index bbc315c24e1..dace4b35b87 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -4,12 +4,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i32>
+;CHECK: add nsw <8 x i32>
+;CHECK: store <8 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) {
 entry:
diff --git a/test/Transforms/SLPVectorizer/X86/pr19657.ll b/test/Transforms/SLPVectorizer/X86/pr19657.ll
index 0e404b2ad7b..32f8da4c7ee 100644
--- a/test/Transforms/SLPVectorizer/X86/pr19657.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -1,11 +1,24 @@
 ; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: load <2 x double>, <2 x double>*
-; CHECK: fadd <2 x double>
-; CHECK: store <2 x double>
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
+
+; V128-LABEL: @foo(
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
 
 define void @foo(double* %x) {
   %1 = load double, double* %x, align 8
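
Aside for readers, not part of the patch: the core of this change is the loop in
vectorizeStores() that retries vectorizeStoreChain() at successively halved
register sizes. Below is a minimal standalone C++ sketch of that search;
vectorizeWithBestSize and TryVectorize are hypothetical names standing in for
the patch's loop and for vectorizeStoreChain(), respectively.

#include <cstdio>
#include <functional>

// Sketch of the size-halving search: start at the target's maximum vector
// register width and halve until a vectorization attempt succeeds or the
// candidate drops below the minimum. Assumes both sizes are powers-of-2 and
// MinVecRegSize >= 1 (the same assumption the patch's FIXME questions).
static bool vectorizeWithBestSize(unsigned MaxVecRegSize, unsigned MinVecRegSize,
                                  const std::function<bool(unsigned)> &TryVectorize) {
  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2)
    if (TryVectorize(Size)) // stands in for vectorizeStoreChain(Operands, costThreshold, R, Size)
      return true;          // corresponds to 'Changed = true; break;' in the patch
  return false;
}

int main() {
  // Example: 64-bit (double) elements on a 256-bit (AVX) target. The search
  // tries 256 bits first (VF = 4); passing -slp-max-reg-size=128 would start
  // it at 128 bits (VF = 2) instead, as the V128 run in pr19657.ll shows.
  auto TryVectorize = [](unsigned Size) {
    std::printf("trying %u-bit vectors (VF = %u for f64)\n", Size, Size / 64);
    return Size == 128; // pretend the cost model only accepts 128-bit vectors
  };
  return vectorizeWithBestSize(256, 128, TryVectorize) ? 0 : 1;
}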