SLP Vectorizer: Implement multi-block slp-vectorization.

Rewrote the SLP-vectorization as a whole-function vectorization pass. It is now able to vectorize chains across multiple basic blocks.
It still does not vectorize PHIs, but this should be easy to do now that we scan the entire function.
I removed the support for extracting values from trees.
We are now able to vectorize more programs, but there are some serious regressions in many workloads (such as flops-6 and mandel-2).



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184647 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nadav Rotem 2013-06-22 21:34:10 +00:00
parent 71f28bf6fb
commit 53a0552b06
7 changed files with 1220 additions and 1360 deletions

View File

@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize
Vectorize.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
VecUtils.cpp
)
add_dependencies(LLVMVectorize intrinsics_gen)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,194 +0,0 @@
//===- VecUtils.h - Vectorization Utilities -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This family of classes and functions manipulate vectors and chains of
// vectors.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/IRBuilder.h"
#include <vector>
namespace llvm {
class BasicBlock;
class Instruction;
class Type;
class VectorType;
class StoreInst;
class Value;
class ScalarEvolution;
class DataLayout;
class TargetTransformInfo;
class AliasAnalysis;
class Loop;
/// Bottom Up SLP vectorization utility class.
///
/// Declares the interface for costing and vectorizing trees of scalar values
/// (see getTreeCost and vectorizeTree), for ordering and vectorizing stores
/// (see vectorizeStores), and the bookkeeping containers those steps share.
/// Only declarations are visible here; the definitions live elsewhere.
struct BoUpSLP {
typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
/// Cost constant equal to 1 << 20 (1048576).
/// NOTE(review): presumably used as a "too expensive to vectorize" sentinel;
/// confirm against the users in the implementation file.
static const int max_cost = 1 << 20;
/// \brief C'tor.
/// NOTE(review): the arguments presumably initialize the BB, SE, DL, TTI, AA
/// and L members declared at the end of this struct; confirm against the
/// definition.
BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
/// \brief Take the pointer operand from the Load/Store instruction.
/// \returns NULL if this is not a valid Load/Store instruction.
static Value *getPointerOperand(Value *I);
/// \brief Take the address space operand from the Load/Store instruction.
/// \returns -1 if this is not a valid Load/Store instruction.
/// NOTE(review): the return type is unsigned, so a returned -1 reaches the
/// caller as the largest unsigned value rather than as a negative number.
static unsigned getAddressSpaceOperand(Value *I);
/// \returns true if the memory operations A and B are consecutive.
bool isConsecutiveAccess(Value *A, Value *B);
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// NOTE(review): \p VF is presumably the vectorization factor (number of
/// lanes); confirm against the definition.
/// \returns the vectorized value.
Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getScalarizationCost(ArrayRef<Value *> VL);
/// \brief Attempts to order and vectorize a sequence of stores. This
/// function does a quadratic scan of the given stores.
/// NOTE(review): \p costThreshold is presumably compared against the tree
/// cost to decide profitability; confirm against the definition.
/// \returns true if the basic block was modified.
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
/// \brief Vectorize a group of scalars into a vector tree.
/// \returns the vectorized value.
Value *vectorizeArith(ArrayRef<Value *> Operands);
/// \returns the list of new instructions that were added in order to collect
/// scalars into vectors. This list can be used to further optimize the gather
/// sequences. The list is returned by mutable reference to the
/// GatherInstructions member.
InstrList &getGatherSeqInstructions() { return GatherInstructions; }
private:
/// \brief This method contains the recursive part of getTreeCost.
int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
/// \brief This recursive method looks for vectorization hazards such as
/// values that are used by multiple users and checks that values are used
/// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
/// \brief This method contains the recursive part of vectorizeTree.
Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
/// \brief Number all of the instructions in the block.
void numberInstructions();
/// \brief Vectorize a sorted sequence of stores.
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getScalarizationCost(Type *Ty);
/// \returns the AA location that is being accessed by the instruction.
AliasAnalysis::Location getLocation(Instruction *I);
/// \brief Checks if it is possible to sink an instruction from
/// \p Src to \p Dst.
/// \returns the pointer to the barrier instruction if we can't sink.
Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
/// \returns the index of the last instruction in the BB from \p VL.
/// Only consider the first \p VF elements.
int getLastIndex(ArrayRef<Value *> VL, unsigned VF);
/// \returns the index of the first User of \p VL.
/// Only consider the first \p VF elements.
int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF);
/// \returns the instruction \p I or \p J that appears last in the BB .
/// NOTE(review): the return type is int, so this presumably yields the index
/// of that instruction (as the ArrayRef overload above does) rather than the
/// instruction itself; confirm against the definition.
int getLastIndex(Instruction *I, Instruction *J);
/// \returns the insertion point for \p Index.
Instruction *getInsertionPoint(unsigned Index);
/// \returns a vector from a collection of scalars in \p VL.
Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
private:
/// Maps instructions to numbers; InstrVec below is the reverse mapping.
SmallDenseMap<Value *, int> InstrIdx;
/// Maps integers to Instructions.
std::vector<Instruction *> InstrVec;
// -- containers that are used during getTreeCost -- //
/// Contains values that must be scalarized because they are used
/// by multiple lanes, or by users outside the tree.
/// NOTICE: The vectorization methods also use this set.
ValueSet MustScalarize;
/// Contains values that have users outside of the vectorized graph.
/// We need to generate extract instructions for these values.
/// NOTICE: The vectorization methods also use this set.
SetVector<Value *> MustExtract;
/// Contains a list of values that are used outside the current tree. This
/// set must be reset between runs.
SetVector<Value *> MultiUserVals;
/// Maps values in the tree to the vector lanes that use them. This map must
/// be reset between runs of getCost.
/// NOTE(review): std::map is used here, but <map> is not among the headers
/// this file includes directly; it is presumably pulled in transitively.
std::map<Value *, int> LaneMap;
/// A list of instructions to ignore while sinking
/// memory instructions. This set must be reset between runs of getCost.
ValueSet MemBarrierIgnoreList;
// -- Containers that are used during vectorizeTree -- //
/// Maps the first scalar of a bundle to its vector. This map must be reset
/// between runs.
DenseMap<Value *, Value *> VectorizedValues;
// -- Containers that are used after vectorization by the caller -- //
/// A list of instructions that are used when gathering scalars into vectors.
/// In many cases these instructions can be hoisted outside of the BB.
/// Iterating over this list is faster than calling LICM.
/// Notice: We insert NULL ptrs to separate between the different gather
/// sequences.
InstrList GatherInstructions;
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
// Analysis and block reference.
BasicBlock *BB;
ScalarEvolution *SE;
DataLayout *DL;
TargetTransformInfo *TTI;
AliasAnalysis *AA;
Loop *L;
};
} // end of namespace
#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H

View File

@ -50,9 +50,9 @@ entry:
; }
; CHECK: @extr_user
; CHECK: load i32*
; CHECK: store <4 x i32>
; CHECK-NEXT: extractelement <4 x i32>
; CHECK: ret
; CHECK-NEXT: ret
define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
entry:
%0 = load i32* %A, align 4
@ -79,9 +79,9 @@ entry:
; In this example we have an external user that is not the first element in the vector.
; CHECK: @extr_user1
; CHECK: load i32*
; CHECK: store <4 x i32>
; CHECK-NEXT: extractelement <4 x i32>
; CHECK: ret
; CHECK-NEXT: ret
define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
entry:
%0 = load i32* %A, align 4

View File

@ -0,0 +1,55 @@
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"
; int bar(double *A, int d) {
; double A0 = A[0];
; double A1 = A[1];
; float F0 = A0;
; float F1 = A1;
; if (d) foo(); <----- This splits the blocks
; F0+=4.0;
; F1+=5.0;
; A[8] = 9.0 + F0;
; A[9] = 5.0 + F1;
; }
;CHECK: @bar
;CHECK: load <2 x double>
;CHECK: fptrunc <2 x double>
;CHECK: call i32
;CHECK: fadd <2 x float>
;CHECK: fpext <2 x float>
;CHECK: store <2 x double>
;CHECK: ret
; Test input for multi-block vectorization: two parallel scalar chains start
; in the entry block and finish in block 9, with an optional call to @foo in
; between. Values and blocks use implicit numbering, so instruction order
; must not change.
define i32 @bar(double* nocapture %A, i32 %d) {
; Entry block: load A[0] and A[1] and truncate both to float.
%1 = load double* %A, align 8
%2 = getelementptr inbounds double* %A, i64 1
%3 = load double* %2, align 8
%4 = fptrunc double %1 to float
%5 = fptrunc double %3 to float
; Skip the call when %d is zero.
%6 = icmp eq i32 %d, 0
br i1 %6, label %9, label %7
; <label>:7 ; preds = %0
; Only reached when %d is non-zero; this call is what splits the chains
; across basic blocks.
%8 = tail call i32 (...)* @foo()
br label %9
; <label>:9 ; preds = %0, %7
; Continue both chains: add 4.0 / 5.0 in float, extend to double, add
; 9.0 / 5.0, then store to the adjacent slots A[8] and A[9].
%10 = fadd float %4, 4.000000e+00
%11 = fadd float %5, 5.000000e+00
%12 = fpext float %10 to double
%13 = fadd double %12, 9.000000e+00
%14 = getelementptr inbounds double* %A, i64 8
store double %13, double* %14, align 8
%15 = fpext float %11 to double
%16 = fadd double %15, 5.000000e+00
%17 = getelementptr inbounds double* %A, i64 9
store double %16, double* %17, align 8
; The return value is unspecified.
ret i32 undef
}
declare i32 @foo(...)

View File

@ -12,8 +12,8 @@ target triple = "x86_64-apple-macosx10.7.0"
;}
;CHECK: @foo
;CHECK: load <4 x i32>
;CHECK: insertelement <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret