From b2fd770136b92637c5f084b743eab29f910288d5 Mon Sep 17 00:00:00 2001 From: Cameron Zwarich Date: Wed, 9 Mar 2011 05:43:05 +0000 Subject: [PATCH] Add support to scalar replacement for partial vector accesses of an alloca, e.g. a union of a float, <2 x float>, and <4 x float>. This mostly comes up with the use of vector intrinsics, especially in NEON when programmers know the layout of the register file. This enables codegen to eliminate a lot of the subregister traffic it would otherwise generate. This commit only enables this for a small number of floating-point cases, but a lot more integer cases. I assume this is okay for all ports, but I did not do extensive testing of the quality of code involving i512 vectors and the like. If there is a use case where this generates worse code than before, let me know and we can scale it back. This fixes . git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127317 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Scalar/ScalarReplAggregates.cpp | 139 ++++++++++++++++-- test/Transforms/ScalarRepl/vector_promote.ll | 59 ++++++++ 2 files changed, 186 insertions(+), 12 deletions(-) diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index bc6035e1fae..1f64ad2606a 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -295,12 +295,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy) /// so far at the offset specified by Offset (which is specified in bytes). /// -/// There are two cases we handle here: +/// There are three cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. /// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. 
-/// 2) A fully general blob of memory, which we turn into some (potentially +/// 2) A union of vector types with power-of-2 size differences, e.g. a float, +/// <2 x float> and <4 x float>. Here we turn element accesses into insert +/// and extract element operations, and <2 x float> accesses into a cast to +/// <2 x double>, an extract, and a cast back to <2 x float>. +/// 3) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. @@ -346,18 +350,68 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, // Remember if we saw a vector type. HadAVector = true; - if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { - // If we're storing/loading a vector of the right size, allow it as a - // vector. If this the first vector we see, remember the type so that - // we know the element size. If this is a subsequent access, ignore it - // even if it is a differing type but the same size. Worst case we can - // bitcast the resultant vectors. - if (VectorTy == 0) - VectorTy = VInTy; + // TODO: Support nonzero offsets? + if (Offset != 0) + return false; + + // Only allow vectors that are a power-of-2 away from the size of the alloca. + if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8))) + return false; + + // If this is the first vector we see, remember the type so that we know the + // element size. + if (!VectorTy) { + VectorTy = VInTy; return true; } - return false; + unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); + unsigned InBitWidth = VInTy->getBitWidth(); + + // Vectors of the same size can be converted using a simple bitcast. 
+ if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) + return true; + + const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType(); + const Type *InElementTy = cast<VectorType>(VInTy)->getElementType(); + + // Do not allow mixed integer and floating-point accesses from vectors of + // different sizes. + if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy()) + return false; + + if (ElementTy->isFloatingPointTy()) { + // Only allow floating-point vectors of different sizes if they have the + // same element type. + // TODO: This could be loosened a bit, but would anything benefit? + if (ElementTy != InElementTy) + return false; + + // There are no arbitrary-precision floating-point types, which limits the + // number of legal vector types with larger element types that we can form + // to bitcast and extract a subvector. + // TODO: We could support some more cases with mixed fp128 and double here. + if (!(BitWidth == 64 || BitWidth == 128) || + !(InBitWidth == 64 || InBitWidth == 128)) + return false; + } else { + assert(ElementTy->isIntegerTy() && "Vector elements must be either integer " + "or floating-point."); + unsigned BitWidth = ElementTy->getPrimitiveSizeInBits(); + unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits(); + + // Do not allow integer types smaller than a byte or types whose widths are + // not a multiple of a byte. + if (BitWidth < 8 || InBitWidth < 8 || + BitWidth % 8 != 0 || InBitWidth % 8 != 0) + return false; + } + + // Pick the largest of the two vector types. + if (InBitWidth > BitWidth) + VectorTy = VInTy; + + return true; } /// CanConvertToScalar - V is a pointer. If we can convert the pointee and all @@ -586,6 +640,26 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, } } +/// getScaledElementType - Gets a scaled element type for a partial vector +/// access of an alloca. The input type must be an integer or float, and +/// the resulting type must be an integer, float or double. 
+static const Type *getScaledElementType(const Type *OldTy, unsigned Scale) { + assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector " + "accesses must be scaled from integer or float elements."); + + LLVMContext &Context = OldTy->getContext(); + unsigned Size = OldTy->getPrimitiveSizeInBits() * Scale; + + if (OldTy->isIntegerTy()) + return Type::getIntNTy(Context, Size); + if (Size == 32) + return Type::getFloatTy(Context); + if (Size == 64) + return Type::getDoubleTy(Context); + + llvm_unreachable("Invalid type for a partial vector access of an alloca!"); +} + /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer /// or vector value FromVal, extracting the bits from the offset specified by /// Offset. This returns the value, which is of type ToType. @@ -606,8 +680,27 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. 
if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) { - if (ToType->isVectorTy()) + if (ToType->isVectorTy()) { + if (isPowerOf2_64(AllocaSize / TD.getTypeAllocSize(ToType))) { + assert(Offset == 0 && "Can't extract a value of a smaller vector type " + "from a nonzero offset."); + + const Type *ToElementTy = cast<VectorType>(ToType)->getElementType(); + unsigned Scale = AllocaSize / TD.getTypeAllocSize(ToType); + const Type *CastElementTy = getScaledElementType(ToElementTy, Scale); + unsigned NumCastVectorElements = VTy->getNumElements() / Scale; + + LLVMContext &Context = FromVal->getContext(); + const Type *CastTy = VectorType::get(CastElementTy, + NumCastVectorElements); + Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp"); + Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get( + Type::getInt32Ty(Context), 0), "tmp"); + return Builder.CreateBitCast(Extract, ToType, "tmp"); + } + return Builder.CreateBitCast(FromVal, ToType, "tmp"); + } // Otherwise it must be an element access. 
unsigned Elt = 0; @@ -728,6 +821,28 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (ValSize == VecSize) return Builder.CreateBitCast(SV, AllocaType, "tmp"); + if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) { + assert(Offset == 0 && "Can't insert a value of a smaller vector type at " + "a nonzero offset."); + + const Type *ToElementTy = + cast<VectorType>(SV->getType())->getElementType(); + unsigned Scale = VecSize / ValSize; + const Type *CastElementTy = getScaledElementType(ToElementTy, Scale); + unsigned NumCastVectorElements = VTy->getNumElements() / Scale; + + LLVMContext &Context = SV->getContext(); + const Type *OldCastTy = VectorType::get(CastElementTy, + NumCastVectorElements); + Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp"); + + Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp"); + Value *Insert = + Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get( + Type::getInt32Ty(Context), 0), "tmp"); + return Builder.CreateBitCast(Insert, AllocaType, "tmp"); + } + uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); // Must be an element insertion. 
diff --git a/test/Transforms/ScalarRepl/vector_promote.ll b/test/Transforms/ScalarRepl/vector_promote.ll index 37cb49f539d..898cbde3530 100644 --- a/test/Transforms/ScalarRepl/vector_promote.ll +++ b/test/Transforms/ScalarRepl/vector_promote.ll @@ -98,3 +98,62 @@ define i64 @test6(<2 x float> %X) { ; CHECK: ret i64 } +define float @test7(<4 x float> %x) { + %a = alloca <4 x float> + store <4 x float> %x, <4 x float>* %a + %p = bitcast <4 x float>* %a to <2 x float>* + %b = load <2 x float>* %p + %q = getelementptr <4 x float>* %a, i32 0, i32 2 + %c = load float* %q + ret float %c +; CHECK: @test7 +; CHECK-NOT: alloca +; CHECK: bitcast <4 x float> %x to <2 x double> +; CHECK-NEXT: extractelement <2 x double> +; CHECK-NEXT: bitcast double %tmp4 to <2 x float> +; CHECK-NEXT: extractelement <4 x float> +} + +define void @test8(<4 x float> %x, <2 x float> %y) { + %a = alloca <4 x float> + store <4 x float> %x, <4 x float>* %a + %p = bitcast <4 x float>* %a to <2 x float>* + store <2 x float> %y, <2 x float>* %p + ret void +; CHECK: @test8 +; CHECK-NOT: alloca +; CHECK: bitcast <4 x float> %x to <2 x double> +; CHECK-NEXT: bitcast <2 x float> %y to double +; CHECK-NEXT: insertelement <2 x double> +; CHECK-NEXT: bitcast <2 x double> %tmp2 to <4 x float> +} + +define i256 @test9(<4 x i256> %x) { + %a = alloca <4 x i256> + store <4 x i256> %x, <4 x i256>* %a + %p = bitcast <4 x i256>* %a to <2 x i256>* + %b = load <2 x i256>* %p + %q = getelementptr <4 x i256>* %a, i32 0, i32 2 + %c = load i256* %q + ret i256 %c +; CHECK: @test9 +; CHECK-NOT: alloca +; CHECK: bitcast <4 x i256> %x to <2 x i512> +; CHECK-NEXT: extractelement <2 x i512> +; CHECK-NEXT: bitcast i512 %tmp4 to <2 x i256> +; CHECK-NEXT: extractelement <4 x i256> +} + +define void @test10(<4 x i256> %x, <2 x i256> %y) { + %a = alloca <4 x i256> + store <4 x i256> %x, <4 x i256>* %a + %p = bitcast <4 x i256>* %a to <2 x i256>* + store <2 x i256> %y, <2 x i256>* %p + ret void +; CHECK: @test10 +; CHECK-NOT: alloca +; 
CHECK: bitcast <4 x i256> %x to <2 x i512> +; CHECK-NEXT: bitcast <2 x i256> %y to i512 +; CHECK-NEXT: insertelement <2 x i512> +; CHECK-NEXT: bitcast <2 x i512> %tmp2 to <4 x i256> +}