diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c9b8e7b3c00..b1f097e2d8a 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -992,63 +992,29 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
     return false;
 
-  // Calculate a constant offset from the base pointer without using SCEV
-  // in the supported cases.
-  // TODO: Add support for the case where one of the pointers is a GEP that
-  //       uses the other pointer.
-  GetElementPtrInst *GepA = dyn_cast<GetElementPtrInst>(PtrA);
-  GetElementPtrInst *GepB = dyn_cast<GetElementPtrInst>(PtrB);
-
-  unsigned BW = DL->getPointerSizeInBits(ASA);
+  unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-  int64_t Sz = DL->getTypeStoreSize(Ty);
+  APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
 
-  // Check if PtrA is the base and PtrB is a constant offset.
-  if (GepB && GepB->getPointerOperand() == PtrA) {
-    APInt Offset(BW, 0);
-    if (GepB->accumulateConstantOffset(*DL, Offset))
-      return Offset.getSExtValue() == Sz;
-    return false;
-  }
+  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
 
-  // Check if PtrB is the base and PtrA is a constant offset.
-  if (GepA && GepA->getPointerOperand() == PtrB) {
-    APInt Offset(BW, 0);
-    if (GepA->accumulateConstantOffset(*DL, Offset))
-      return Offset.getSExtValue() == -Sz;
-    return false;
-  }
+  APInt OffsetDelta = OffsetB - OffsetA;
 
-  // If both pointers are GEPs:
-  if (GepA && GepB) {
-    // Check that they have the same base pointer and number of indices.
-    if (GepA->getPointerOperand() != GepB->getPointerOperand() ||
-        GepA->getNumIndices() != GepB->getNumIndices())
-      return false;
+  // Check if they are based on the same pointer. That makes the offsets
+  // sufficient.
+  if (PtrA == PtrB)
+    return OffsetDelta == Size;
 
-    // Try to strip the geps. This makes SCEV faster.
-    // Make sure that all of the indices except for the last are identical.
-    int LastIdx = GepA->getNumIndices();
-    for (int i = 0; i < LastIdx - 1; i++) {
-      if (GepA->getOperand(i+1) != GepB->getOperand(i+1))
-        return false;
-    }
+  // Compute the base pointer delta needed to make the final delta equal to
+  // the size.
+  APInt BaseDelta = Size - OffsetDelta;
 
-    PtrA = GepA->getOperand(LastIdx);
-    PtrB = GepB->getOperand(LastIdx);
-    Sz = 1;
-  }
-
-  ConstantInt *CA = dyn_cast<ConstantInt>(PtrA);
-  ConstantInt *CB = dyn_cast<ConstantInt>(PtrB);
-  if (CA && CB) {
-    return (CA->getSExtValue() + Sz == CB->getSExtValue());
-  }
-
-  // Calculate the distance.
+  // Otherwise compute the distance between the base pointers with SCEV.
   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
-  const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz);
+  const SCEV *C = SE->getConstant(BaseDelta);
   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
   return X == PtrSCEVB;
 }
diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
index 1c7f9ccf602..f77e945aad9 100644
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
 
@@ -95,3 +95,41 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize discontiguous memory in arrays whose
+; element size is less than the alignment, across several GEP formations.
+
+entry:
+  %i1.0 = load x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80* %i1.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK: insertelement <2 x x86_fp80>
+; CHECK: insertelement <2 x x86_fp80>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80* %i2.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK: insertelement <2 x x86_fp80>
+; CHECK: insertelement <2 x x86_fp80>
+  br label %end
+
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x x86_fp80>
+; CHECK: extractelement <2 x x86_fp80>
+; CHECK: extractelement <2 x x86_fp80>
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ret void
+}
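
For reference, below is a minimal standalone sketch of the check this patch implements, in plain C++ with no LLVM dependencies; the StrippedPtr struct and isConsecutive helper are hypothetical names used only for illustration. Each pointer is reduced to an underlying base plus an accumulated constant byte offset (the role stripAndAccumulateInBoundsConstantOffsets plays in the patch), and two accesses count as consecutive only when the offset delta equals the element's store size:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the result of stripping a pointer down to its
// underlying base object plus an accumulated constant byte offset.
struct StrippedPtr {
  const void *Base;
  int64_t Offset;
};

// Two accesses of StoreSize bytes are consecutive iff they share a base and
// the second starts exactly StoreSize bytes after the first. When the bases
// differ, the real code instead asks SCEV whether the bases are exactly
// BaseDelta = StoreSize - OffsetDelta bytes apart; that fallback is omitted
// from this sketch.
static bool isConsecutive(StrippedPtr A, StrippedPtr B, int64_t StoreSize) {
  if (A.Base != B.Base)
    return false; // SCEV fallback would go here
  return B.Offset - A.Offset == StoreSize;
}

int main() {
  char Array[64];
  // x86_fp80 has a 10-byte store size but a 16-byte alloc size/alignment in
  // the test's datalayout, so a GEP with index 1 advances 16 bytes while a
  // contiguous pair would need a 10-byte delta: elements 0 and 1 are NOT
  // consecutive accesses.
  StrippedPtr Elt0{Array, 0}, Elt1{Array, 16};
  assert(!isConsecutive(Elt0, Elt1, 10));
  // A type whose store size matches its stride, by contrast, is contiguous.
  StrippedPtr P{Array, 0}, Q{Array, 10};
  assert(isConsecutive(P, Q, 10));
  return 0;
}

This is also why the new @test function expects insertelement/extractelement sequences rather than vector loads and stores: the x86_fp80 elements sit 16 bytes apart but are only 10 bytes wide, so isConsecutiveAccess must reject each pair and the SLP vectorizer must gather and scatter the scalars around the vectorized phis.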