diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6d1c2c0ffee..d2d4db33f69 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1015,8 +1015,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return 0; } case Instruction::ExtractElement: { - if (CanReuseExtract(VL)) - return 0; + if (CanReuseExtract(VL)) { + int DeadCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast(VL[i]); + if (E->hasOneUse()) + // Take credit for instruction that will become dead. + DeadCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + } + return -DeadCost; + } return getGatherCost(VecTy); } case Instruction::ZExt: diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index a6a7f32bfc5..620292bdaf3 100644 --- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s -check-prefix=ZEROTHRESH target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -218,4 +219,28 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ret <4 x float> %v3 } +; Check that cost model for vectorization takes credit for +; instructions that are erased. +define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { +; ZEROTHRESH-LABEL: @take_credit( +; ZEROTHRESH-CHECK: %1 = fadd <4 x float> %a, %b + %a0 = extractelement <4 x float> %a, i32 0 + %b0 = extractelement <4 x float> %b, i32 0 + %c0 = fadd float %a0, %b0 + %a1 = extractelement <4 x float> %a, i32 1 + %b1 = extractelement <4 x float> %b, i32 1 + %c1 = fadd float %a1, %b1 + %a2 = extractelement <4 x float> %a, i32 2 + %b2 = extractelement <4 x float> %b, i32 2 + %c2 = fadd float %a2, %b2 + %a3 = extractelement <4 x float> %a, i32 3 + %b3 = extractelement <4 x float> %b, i32 3 + %c3 = fadd float %a3, %b3 + %v0 = insertelement <4 x float> undef, float %c0, i32 0 + %v1 = insertelement <4 x float> %v0, float %c1, i32 1 + %v2 = insertelement <4 x float> %v1, float %c2, i32 2 + %v3 = insertelement <4 x float> %v2, float %c3, i32 3 + ret <4 x float> %v3 +} + attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }