From 99a54942ae0fb6fdca03e91b2e492e9738fa4436 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 17 Dec 2012 14:51:24 +0000 Subject: [PATCH] Teach the rewriting of memcpy calls to support subvector copies. This also cleans up a bit of the memcpy call rewriting by sinking some irrelevant code further down and making the call-emitting code a bit more concrete. Previously, memcpy of a subvector would actually miscompile (!!!) the copy into a single vector element copy. I have no idea how this ever worked. =/ This is the memcpy half of PR14478 which we probably weren't noticing previously because it didn't actually assert. The rewrite relies on the newly refactored insert- and extractVector functions to do the heavy lifting, and those are the same as used for loads and stores which makes the test coverage a bit more meaningful here. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170338 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/SROA.cpp | 81 ++++++++++++------------ test/Transforms/SROA/vector-promotion.ll | 48 ++++++++++++++ 2 files changed, 89 insertions(+), 40 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 1ac239e13e3..ab61a2fd674 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -2900,37 +2900,22 @@ private: // Record this instruction for deletion. Pass.DeadInsts.insert(&II); - bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset; - bool IsVectorElement = VecTy && !IsWholeAlloca; - uint64_t Size = EndOffset - BeginOffset; - IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; - - Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() - : II.getRawDest()->getType(); - if (!EmitMemCpy) { - if (IsVectorElement) - OtherPtrTy = VecTy->getElementType()->getPointerTo(); - else if (IntTy && !IsWholeAlloca) - OtherPtrTy = SubIntTy->getPointerTo(); - else - OtherPtrTy = NewAI.getType(); - } - - // Compute the other pointer, folding as much as possible to produce - // a single, simple GEP in most cases. - Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, - getName("." + OtherPtr->getName())); - // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after rewriting this instruction. + Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); if (AllocaInst *AI = dyn_cast(OtherPtr->stripInBoundsOffsets())) Pass.Worklist.insert(AI); if (EmitMemCpy) { + Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() + : II.getRawDest()->getType(); + + // Compute the other pointer, folding as much as possible to produce + // a single, simple GEP in most cases. + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + Value *OurPtr = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); @@ -2951,18 +2936,38 @@ private: if (!Align) Align = 1; - Value *SrcPtr = OtherPtr; + bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset; + uint64_t Size = EndOffset - BeginOffset; + unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + unsigned NumElements = EndIndex - BeginIndex; + IntegerType *SubIntTy + = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + + Type *OtherPtrTy = NewAI.getType(); + if (VecTy && !IsWholeAlloca) { + if (NumElements == 1) + OtherPtrTy = VecTy->getElementType(); + else + OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); + + OtherPtrTy = OtherPtrTy->getPointerTo(); + } else if (IntTy && !IsWholeAlloca) { + OtherPtrTy = SubIntTy->getPointerTo(); + } + + Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); Value *Src; - if (IsVectorElement && !IsDest) { - // We have to extract rather than load. - Src = IRB.CreateExtractElement( - IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), - IRB.getInt32(getIndex(BeginOffset)), - getName(".copyextract")); + if (VecTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")); @@ -2975,7 +2980,11 @@ private: getName(".copyload")); } - if (IntTy && !IsWholeAlloca && IsDest) { + if (VecTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".oldload")); Old = convertValue(TD, IRB, Old, IntTy); @@ -2985,14 +2994,6 @@ private: Src = convertValue(TD, IRB, Src, NewAllocaTy); } - if (IsVectorElement && IsDest) { - // We have to insert into a loaded copy before storing. - Src = IRB.CreateInsertElement( - IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), - Src, IRB.getInt32(getIndex(BeginOffset)), - getName(".insert")); - } - StoreInst *Store = cast( IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); (void)Store; diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll index 846a4326f76..02f6d040cc9 100644 --- a/test/Transforms/SROA/vector-promotion.ll +++ b/test/Transforms/SROA/vector-promotion.ll @@ -314,6 +314,54 @@ entry: ; CHECK-NEXT: ret <4 x float> %[[insert4]] } +define <4 x float> @test_subvec_memcpy(i8* %x, i8* %y, i8* %z, i8* %f, i8* %out) { +; CHECK: @test_subvec_memcpy +entry: + %a = alloca <4 x float> +; CHECK-NOT: alloca + + %a.gep0 = getelementptr <4 x float>* %a, i32 0, i32 0 + %a.cast0 = bitcast float* %a.gep0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast0, i8* %x, i32 8, i32 0, i1 false) +; CHECK: %[[xptr:.*]] = bitcast i8* %x to <2 x float>* +; CHECK-NEXT: %[[x:.*]] = load <2 x float>* %[[xptr]] +; CHECK-NEXT: %[[expand_x:.*]] = shufflevector <2 x float> %[[x]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: %[[insert_x:.*]] = shufflevector <4 x float> %[[expand_x]], <4 x float> undef, <4 x i32> + + %a.gep1 = getelementptr <4 x float>* %a, i32 0, i32 1 + %a.cast1 = bitcast float* %a.gep1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast1, i8* %y, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[yptr:.*]] = bitcast i8* %y to <2 x float>* +; CHECK-NEXT: %[[y:.*]] = load <2 x float>* %[[yptr]] +; CHECK-NEXT: %[[expand_y:.*]] = shufflevector <2 x float> %[[y]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: %[[insert_y:.*]] = shufflevector <4 x float> %[[expand_y]], <4 x float> %[[insert_x]], <4 x i32> + + %a.gep2 = getelementptr <4 x float>* %a, i32 0, i32 2 + %a.cast2 = bitcast float* %a.gep2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast2, i8* %z, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[zptr:.*]] = bitcast i8* %z to <2 x float>* +; CHECK-NEXT: %[[z:.*]] = load <2 x float>* %[[zptr]] +; CHECK-NEXT: %[[expand_z:.*]] = shufflevector <2 x float> %[[z]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: %[[insert_z:.*]] = shufflevector <4 x float> %[[expand_z]], <4 x float> %[[insert_y]], <4 x i32> + + %a.gep3 = getelementptr <4 x float>* %a, i32 0, i32 3 + %a.cast3 = bitcast float* %a.gep3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast3, i8* %f, i32 4, i32 0, i1 false) +; CHECK-NEXT: %[[fptr:.*]] = bitcast i8* %f to float* +; CHECK-NEXT: %[[f:.*]] = load float* %[[fptr]] +; CHECK-NEXT: %[[insert_f:.*]] = insertelement <4 x float> %[[insert_z]], float %[[f]], i32 3 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %a.cast2, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[outptr:.*]] = bitcast i8* %out to <2 x float>* +; CHECK-NEXT: %[[extract_out:.*]] = shufflevector <4 x float> %[[insert_f]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: store <2 x float> %[[extract_out]], <2 x float>* %[[outptr]] + + %ret = load <4 x float>* %a + + ret <4 x float> %ret +; CHECK-NEXT: ret <4 x float> %[[insert_f]] +} + define i32 @PR14212() { ; CHECK: @PR14212 ; This caused a crash when "splitting" the load of the i32 in order to promote