mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-27 14:34:58 +00:00
SLPVectorizer: add initial support for reduction variable vectorization.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
ef596e1a80
commit
f7eaf29cf7
@ -26,6 +26,7 @@
|
||||
#include "llvm/Analysis/Verifier.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
@ -38,7 +39,7 @@
|
||||
using namespace llvm;
|
||||
|
||||
static cl::opt<int>
|
||||
SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
|
||||
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
|
||||
cl::desc("Only vectorize trees if the gain is above this "
|
||||
"number. (gain = -cost of vectorization)"));
|
||||
namespace {
|
||||
@ -63,7 +64,7 @@ struct SLPVectorizer : public BasicBlockPass {
|
||||
/// object. We sort the stores to their base objects to reduce the cost of the
|
||||
/// quadratic search on the stores. TODO: We can further reduce this cost
|
||||
/// if we flush the chain creation every time we run into a memory barrier.
|
||||
bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
|
||||
bool collectStores(BasicBlock *BB, BoUpSLP &R) {
|
||||
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
||||
StoreInst *SI = dyn_cast<StoreInst>(it);
|
||||
if (!SI)
|
||||
@ -84,7 +85,79 @@ struct SLPVectorizer : public BasicBlockPass {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RollStoreChains(BoUpSLP &R) {
|
||||
bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) {
|
||||
if (!A || !B) return false;
|
||||
BoUpSLP::ValueList VL;
|
||||
VL.push_back(A);
|
||||
VL.push_back(B);
|
||||
int Cost = R.getTreeCost(VL);
|
||||
DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
|
||||
if (Cost >= -SLPCostThreshold) return false;
|
||||
DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
|
||||
R.vectorizeArith(VL);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) {
|
||||
if (!V) return false;
|
||||
BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
|
||||
BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
|
||||
// Try to vectorize V.
|
||||
if (tryToVectorizePair(A, B, R)) return true;
|
||||
|
||||
// Try to skip B.
|
||||
if (B && B->hasOneUse()) {
|
||||
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
|
||||
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
|
||||
if (tryToVectorizePair(A, B0, R)) {
|
||||
B->moveBefore(V);
|
||||
return true;
|
||||
}
|
||||
if (tryToVectorizePair(A, B1, R)) {
|
||||
B->moveBefore(V);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to slip A.
|
||||
if (A && A->hasOneUse()) {
|
||||
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
|
||||
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
|
||||
if (tryToVectorizePair(A0, B, R)) {
|
||||
A->moveBefore(V);
|
||||
return true;
|
||||
}
|
||||
if (tryToVectorizePair(A1, B, R)) {
|
||||
A->moveBefore(V);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
|
||||
bool Changed = false;
|
||||
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
||||
if (isa<DbgInfoIntrinsic>(it)) continue;
|
||||
PHINode *P = dyn_cast<PHINode>(it);
|
||||
if (!P) return Changed;
|
||||
// Check that the PHI is a reduction PHI.
|
||||
if (P->getNumIncomingValues() != 2) return Changed;
|
||||
Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
|
||||
(P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
|
||||
// Check if this is a Binary Operator.
|
||||
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
|
||||
if (!BI) continue;
|
||||
|
||||
Value *Inst = BI->getOperand(0);
|
||||
if (Inst == P) Inst = BI->getOperand(1);
|
||||
Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool rollStoreChains(BoUpSLP &R) {
|
||||
bool Changed = false;
|
||||
// Attempt to sort and vectorize each of the store-groups.
|
||||
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
|
||||
@ -116,13 +189,15 @@ struct SLPVectorizer : public BasicBlockPass {
|
||||
// he store instructions.
|
||||
BoUpSLP R(&BB, SE, DL, TTI, AA);
|
||||
|
||||
if (!CollectStores(&BB, R))
|
||||
return false;
|
||||
bool Changed = vectorizeReductions(&BB, R);
|
||||
|
||||
bool Changed = RollStoreChains(R);
|
||||
if (Changed) {
|
||||
if (!collectStores(&BB, R))
|
||||
return Changed;
|
||||
|
||||
if (rollStoreChains(R)) {
|
||||
DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
|
||||
DEBUG(verifyFunction(*BB.getParent()));
|
||||
Changed |= true;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
|
@ -208,6 +208,16 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BoUpSLP::vectorizeArith(ValueList &Operands) {
|
||||
Value *Vec = vectorizeTree(Operands, Operands.size());
|
||||
BasicBlock::iterator Loc = cast<Instruction>(Vec);
|
||||
IRBuilder<> Builder(++Loc);
|
||||
for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
|
||||
Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
|
||||
Operands[i]->replaceAllUsesWith(S);
|
||||
}
|
||||
}
|
||||
|
||||
int BoUpSLP::getTreeCost(ValueList &VL) {
|
||||
// Get rid of the list of stores that were removed, and from the
|
||||
// lists of instructions with multiple users.
|
||||
|
@ -66,6 +66,9 @@ struct BoUpSLP {
|
||||
/// \returns true if the basic block was modified.
|
||||
bool vectorizeStores(StoreList &Stores, int costThreshold);
|
||||
|
||||
/// \brief Vectorize a group of scalars into a vector tree.
|
||||
void vectorizeArith(ValueList &Operands);
|
||||
|
||||
private:
|
||||
/// \returns This method contains the recursive part of getTreeCost.
|
||||
int getTreeCost_rec(ValueList &VL, unsigned Depth);
|
||||
|
52
test/Transforms/SLPVectorizer/X86/reduction.ll
Normal file
52
test/Transforms/SLPVectorizer/X86/reduction.ll
Normal file
@ -0,0 +1,52 @@
|
||||
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
|
||||
target triple = "i386-apple-macosx10.8.0"
|
||||
|
||||
; int foo(double *A, int n, int m) {
|
||||
; double sum = 0, v1 = 2, v0 = 3;
|
||||
; for (int i=0; i < n; ++i)
|
||||
; sum += 7*A[i*2] + 7*A[i*2+1];
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
;CHECK: reduce
|
||||
;CHECK: load <2 x double>
|
||||
;CHECK: fmul <2 x double>
|
||||
;CHECK: ret
|
||||
define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 {
|
||||
entry:
|
||||
%cmp13 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp13, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
||||
%sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
|
||||
%mul = shl nsw i32 %i.015, 1
|
||||
%arrayidx = getelementptr inbounds double* %A, i32 %mul
|
||||
%0 = load double* %arrayidx, align 4, !tbaa !0
|
||||
%mul1 = fmul double %0, 7.000000e+00
|
||||
%add12 = or i32 %mul, 1
|
||||
%arrayidx3 = getelementptr inbounds double* %A, i32 %add12
|
||||
%1 = load double* %arrayidx3, align 4, !tbaa !0
|
||||
%mul4 = fmul double %1, 7.000000e+00
|
||||
%add5 = fadd double %mul1, %mul4
|
||||
%add6 = fadd double %sum.014, %add5
|
||||
%inc = add nsw i32 %i.015, 1
|
||||
%exitcond = icmp eq i32 %inc, %n
|
||||
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
|
||||
|
||||
for.cond.for.end_crit_edge: ; preds = %for.body
|
||||
%phitmp = fptosi double %add6 to i32
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
|
||||
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
||||
!0 = metadata !{metadata !"double", metadata !1}
|
||||
!1 = metadata !{metadata !"omnipotent char", metadata !2}
|
||||
!2 = metadata !{metadata !"Simple C/C++ TBAA"}
|
Loading…
x
Reference in New Issue
Block a user