diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 59e2ff937f7..c0d44f179bc 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1179,6 +1179,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       }
       return;
     }
+    case Instruction::GetElementPtr: {
+      // We don't combine GEPs with complicated (nested) indexing.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We combine only GEPs with a single use.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        if (cast<Instruction>(VL[j])->getNumUses() > 1) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (multiple uses).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We can't combine several GEPs into one vector if they operate on
+      // different types.
+      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+        if (Ty0 != CurTy) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We don't combine GEPs with non-constant indexes.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        auto Op = cast<Instruction>(VL[j])->getOperand(1);
+        if (!isa<ConstantInt>(Op)) {
+          DEBUG(
+              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+      for (unsigned i = 0, e = 2; i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
@@ -1416,6 +1474,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       }
       return VecCost - ScalarCost;
     }
+    case Instruction::GetElementPtr: {
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_UniformConstantValue;
+
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+      int VecCost =
+          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+      return VecCost - ScalarCost;
+    }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
       int ScalarLdCost = VecTy->getNumElements() *
@@ -1982,6 +2054,35 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       ++NumVectorInstructions;
       return propagateMetadata(S, E->Scalars);
     }
+    case Instruction::GetElementPtr: {
+      setInsertPointAfterBundle(E->Scalars);
+
+      ValueList Op0VL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        Op0VL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+
+      Value *Op0 = vectorizeTree(Op0VL);
+
+      std::vector<Value *> OpVecs;
+      for (int j = 1, e = cast<Instruction>(VL0)->getNumOperands(); j < e;
+           ++j) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+          OpVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(j));
+
+        Value *OpVec = vectorizeTree(OpVL);
+        OpVecs.push_back(OpVec);
+      }
+
+      Value *V = Builder.CreateGEP(Op0, OpVecs);
+      E->VectorizedValue = V;
+      ++NumVectorInstructions;
+
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
       setInsertPointAfterBundle(E->Scalars);
diff --git a/test/Transforms/SLPVectorizer/X86/gep.ll b/test/Transforms/SLPVectorizer/X86/gep.ll
new file mode 100644
index 00000000000..9e105ec9848
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test if SLP can handle GEP expressions.
+; The test performs the following actions:
+; x->first = y->first + 16
+; x->second = y->second + 16
+
+; CHECK-LABEL: foo1
+; CHECK: <2 x i32*>
+define void @foo1 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y) {
+  %1 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 0
+  %2 = load i32** %1, align 8
+  %3 = getelementptr inbounds i32* %2, i64 16
+  %4 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 0
+  store i32* %3, i32** %4, align 8
+  %5 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 1
+  %6 = load i32** %5, align 8
+  %7 = getelementptr inbounds i32* %6, i64 16
+  %8 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 1
+  store i32* %7, i32** %8, align 8
+  ret void
+}
+
+; Test that we don't vectorize GEP expressions if indexes are not constants.
+; We can't produce efficient code in that case.
+; CHECK-LABEL: foo2
+; CHECK-NOT: <2 x i32*>
+define void @foo2 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y, i32 %i) {
+  %1 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 0
+  %2 = load i32** %1, align 8
+  %3 = getelementptr inbounds i32* %2, i32 %i
+  %4 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 0
+  store i32* %3, i32** %4, align 8
+  %5 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 1
+  %6 = load i32** %5, align 8
+  %7 = getelementptr inbounds i32* %6, i32 %i
+  %8 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 1
+  store i32* %7, i32** %8, align 8
+  ret void
+}
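
For reference, a minimal C sketch of the source pattern that the new gep.ll test (foo1) exercises; the struct and function below are an illustrative reconstruction from the test's own comments, not code taken from the patch. Each field update lowers to a load, a getelementptr with the constant index 16, and a store, so the two GEPs form exactly the kind of bundle the new GetElementPtr case accepts.

    /* Illustrative only: two pointer fields, matching the { i32*, i32* } struct in gep.ll. */
    struct S {
      int *first;
      int *second;
    };

    /* Compiles to the IR in foo1: two loads, two constant-index GEPs, two stores;
       per the CHECK line, the two GEPs are combined into a single <2 x i32*> GEP. */
    void foo1(struct S *restrict x, struct S *restrict y) {
      x->first = y->first + 16;
      x->second = y->second + 16;
    }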