diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index d1a0a82b9b0..947513a3657 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1847,10 +1847,17 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
   if (OldTy == NewTy)
     return true;
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
-      if (NewITy->getBitWidth() >= OldITy->getBitWidth())
-        return true;
+
+  // For integer types, we can't handle any bit-width differences. This would
+  // break both vector conversions with extension and introduce endianness
+  // issues when in conjunction with loads and stores.
+  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+               cast<IntegerType>(NewTy)->getBitWidth() &&
+           "We can't have the same bitwidth for different int types");
+    return false;
+  }
+
   if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
     return false;
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
@@ -1885,10 +1892,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
   if (OldTy == NewTy)
     return V;
 
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
-      if (NewITy->getBitWidth() > OldITy->getBitWidth())
-        return IRB.CreateZExt(V, NewITy);
+  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+         "Integer types must be the exact same to convert.");
 
   // See if we need inttoptr for this type pair. A cast involving both scalars
   // and vectors requires and additional bitcast.
@@ -2134,6 +2139,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
   if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
     if (LI->isVolatile())
       return false;
+    // We can't handle loads that extend past the allocated memory.
+    if (DL.getTypeStoreSize(LI->getType()) > Size)
+      return false;
     // Note that we don't count vector loads or stores as whole-alloca
     // operations which enable integer widening because we would prefer to use
     // vector widening instead.
@@ -2152,6 +2160,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
     Type *ValueTy = SI->getValueOperand()->getType();
     if (SI->isVolatile())
      return false;
+    // We can't handle stores that extend past the allocated memory.
+    if (DL.getTypeStoreSize(ValueTy) > Size)
+      return false;
     // Note that we don't count vector loads or stores as whole-alloca
     // operations which enable integer widening because we would prefer to use
     // vector widening instead.
@@ -2585,6 +2596,7 @@ private:
 
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                              : LI.getType();
+    const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
@@ -2592,13 +2604,27 @@ private:
     } else if (IntTy && LI.getType()->isIntegerTy()) {
       V = rewriteIntegerLoad(LI);
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
-               canConvertValue(DL, NewAllocaTy, LI.getType())) {
+               NewEndOffset == NewAllocaEndOffset &&
+               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+                 TargetTy->isIntegerTy()))) {
       LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                               LI.isVolatile(), LI.getName());
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
       V = NewLI;
+
+      // If this is an integer load past the end of the slice (which means the
+      // bytes outside the slice are undef or this load is dead) just forcibly
+      // fix the integer size with correct handling of endianness.
+      if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+        if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+          if (AITy->getBitWidth() < TITy->getBitWidth()) {
+            V = IRB.CreateZExt(V, TITy, "load.ext");
+            if (DL.isBigEndian())
+              V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+                                "endian_shift");
+          }
     } else {
       Type *LTy = TargetTy->getPointerTo();
       LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
@@ -2718,10 +2744,25 @@ private:
     if (IntTy && V->getType()->isIntegerTy())
       return rewriteIntegerStore(V, SI);
 
+    const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
     StoreInst *NewSI;
     if (NewBeginOffset == NewAllocaBeginOffset &&
         NewEndOffset == NewAllocaEndOffset &&
-        canConvertValue(DL, V->getType(), NewAllocaTy)) {
+        (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+         (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+          V->getType()->isIntegerTy()))) {
+      // If this is an integer store past the end of slice (and thus the bytes
+      // past that point are irrelevant or this is unreachable), truncate the
+      // value prior to storing.
+      if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+        if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+          if (VITy->getBitWidth() > AITy->getBitWidth()) {
+            if (DL.isBigEndian())
+              V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+                                 "endian_shift");
+            V = IRB.CreateTrunc(V, AITy, "load.trunc");
+          }
+
       V = convertValue(DL, IRB, V, NewAllocaTy);
       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                      SI.isVolatile());
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 7c8955b28fa..ad2794167a5 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1195,20 +1195,24 @@ entry:
   %a = alloca <{ i1 }>, align 8
   %b = alloca <{ i1 }>, align 8
 ; CHECK: %[[a:.*]] = alloca i8, align 8
+; CHECK-NEXT: %[[b:.*]] = alloca i8, align 8
 
   %b.i1 = bitcast <{ i1 }>* %b to i1*
   store i1 %x, i1* %b.i1, align 8
   %b.i8 = bitcast <{ i1 }>* %b to i8*
   %foo = load i8, i8* %b.i8, align 1
-; CHECK-NEXT: %[[ext:.*]] = zext i1 %x to i8
-; CHECK-NEXT: store i8 %[[ext]], i8* %[[a]], align 8
-; CHECK-NEXT: {{.*}} = load i8, i8* %[[a]], align 8
+; CHECK-NEXT: %[[b_cast:.*]] = bitcast i8* %[[b]] to i1*
+; CHECK-NEXT: store i1 %x, i1* %[[b_cast]], align 8
+; CHECK-NEXT: {{.*}} = load i8, i8* %[[b]], align 8
 
   %a.i8 = bitcast <{ i1 }>* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.i8, i8* %b.i8, i32 1, i32 1, i1 false) nounwind
   %bar = load i8, i8* %a.i8, align 1
   %a.i1 = getelementptr inbounds <{ i1 }>, <{ i1 }>* %a, i32 0, i32 0
   %baz = load i1, i1* %a.i1, align 1
+; CHECK-NEXT: %[[copy:.*]] = load i8, i8* %[[b]], align 8
+; CHECK-NEXT: store i8 %[[copy]], i8* %[[a]], align 8
+; CHECK-NEXT: {{.*}} = load i8, i8* %[[a]], align 8
 ; CHECK-NEXT: %[[a_cast:.*]] = bitcast i8* %[[a]] to i1*
 ; CHECK-NEXT: {{.*}} = load i1, i1* %[[a_cast]], align 8
diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll
index b5a04ca8e64..4de7bfcb898 100644
--- a/test/Transforms/SROA/big-endian.ll
+++ b/test/Transforms/SROA/big-endian.ll
@@ -112,3 +112,126 @@ entry:
 ; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64
 ; CHECK-NEXT: ret i64 %[[ret]]
 }
+
+define i64 @PR14132(i1 %flag) {
+; CHECK-LABEL: @PR14132(
+; Here we form a PHI-node by promoting the pointer alloca first, and then in
+; order to promote the other two allocas, we speculate the load of the
+; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
+; alloca. While this is a bit dubious, we were asserting on trying to
+; rewrite it. The trick is that the code using the value may carefully take
+; steps to only use the not-undef bits, and so we need to at least loosely
+; support this. This test is particularly interesting because how we handle
+; a load of an i64 from an i8 alloca is dependent on endianness.
+entry:
+  %a = alloca i64, align 8
+  %b = alloca i8, align 8
+  %ptr = alloca i64*, align 8
+; CHECK-NOT: alloca
+
+  %ptr.cast = bitcast i64** %ptr to i8**
+  store i64 0, i64* %a
+  store i8 1, i8* %b
+  store i64* %a, i64** %ptr
+  br i1 %flag, label %if.then, label %if.end
+
+if.then:
+  store i8* %b, i8** %ptr.cast
+  br label %if.end
+; CHECK-NOT: store
+; CHECK: %[[ext:.*]] = zext i8 1 to i64
+; CHECK: %[[shift:.*]] = shl i64 %[[ext]], 56
+
+if.end:
+  %tmp = load i64*, i64** %ptr
+  %result = load i64, i64* %tmp
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i64 [ %[[shift]], %if.then ], [ 0, %entry ]
+
+  ret i64 %result
+; CHECK-NEXT: ret i64 %[[result]]
+}
+
+declare void @f(i64 %x, i32 %y)
+
+define void @test3() {
+; CHECK-LABEL: @test3(
+;
+; This is a test that specifically exercises the big-endian lowering because it
+; ends up splitting a 64-bit integer into two smaller integers and has a number
+; of tricky aspects (the i24 type) that make that hard. Historically, SROA
+; would miscompile this by either dropping a most significant byte or least
+; significant byte due to shrinking the [4,8) slice to an i24, or by failing to
+; move the bytes around correctly.
+;
+; The magical number 34494054408 is used because it has bits set in various
+; bytes so that it is clear if those bytes fail to be propagated.
+;
+; If you're debugging this, rather than using the direct magical numbers, run
+; the IR through '-sroa -instcombine'. With '-instcombine' these will be
+; constant folded, and if the i64 doesn't round-trip correctly, you've found
+; a bug!
+;
+entry:
+  %a = alloca { i32, i24 }, align 4
+; CHECK-NOT: alloca
+
+  %tmp0 = bitcast { i32, i24 }* %a to i64*
+  store i64 34494054408, i64* %tmp0
+  %tmp1 = load i64, i64* %tmp0, align 4
+  %tmp2 = bitcast { i32, i24 }* %a to i32*
+  %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 134316040 to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 8 to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
+
+  call void @f(i64 %tmp1, i32 %tmp3)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 8)
+  ret void
+; CHECK: ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: @test4
+;
+; Much like @test3, this is specifically testing big-endian management of data.
+; Also similarly, it uses constants with particular bits set to help track
+; whether values are corrupted, and can be easily evaluated by running through
+; -instcombine to see that the i64 round-trips.
+;
+entry:
+  %a = alloca { i32, i24 }, align 4
+  %a2 = alloca i64, align 4
+; CHECK-NOT: alloca
+
+  store i64 34494054408, i64* %a2
+  %tmp0 = bitcast { i32, i24 }* %a to i8*
+  %tmp1 = bitcast i64* %a2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp0, i8* %tmp1, i64 8, i32 4, i1 false)
+; CHECK: %[[LO_SHR:.*]] = lshr i64 34494054408, 32
+; CHECK: %[[LO_START:.*]] = trunc i64 %[[LO_SHR]] to i32
+; CHECK: %[[HI_START:.*]] = trunc i64 34494054408 to i32
+
+  %tmp2 = bitcast { i32, i24 }* %a to i64*
+  %tmp3 = load i64, i64* %tmp2, align 4
+  %tmp4 = bitcast { i32, i24 }* %a to i32*
+  %tmp5 = load i32, i32* %tmp4, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 %[[HI_START]] to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 %[[LO_START]] to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
+
+  call void @f(i64 %tmp3, i32 %tmp5)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 %[[LO_START]])
+  ret void
+; CHECK: ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
index e97bd66d052..fb76548b1d1 100644
--- a/test/Transforms/SROA/phi-and-select.ll
+++ b/test/Transforms/SROA/phi-and-select.ll
@@ -438,26 +438,26 @@ define i64 @PR14132(i1 %flag) {
 ; steps to only use the not-undef bits, and so we need to at least loosely
 ; support this..
 entry:
-  %a = alloca i64
-  %b = alloca i8
-  %ptr = alloca i64*
+  %a = alloca i64, align 8
+  %b = alloca i8, align 8
+  %ptr = alloca i64*, align 8
 ; CHECK-NOT: alloca
 
   %ptr.cast = bitcast i64** %ptr to i8**
-  store i64 0, i64* %a
-  store i8 1, i8* %b
-  store i64* %a, i64** %ptr
+  store i64 0, i64* %a, align 8
+  store i8 1, i8* %b, align 8
+  store i64* %a, i64** %ptr, align 8
   br i1 %flag, label %if.then, label %if.end
 
 if.then:
-  store i8* %b, i8** %ptr.cast
+  store i8* %b, i8** %ptr.cast, align 8
   br label %if.end
 ; CHECK-NOT: store
 ; CHECK: %[[ext:.*]] = zext i8 1 to i64
 
 if.end:
-  %tmp = load i64*, i64** %ptr
-  %result = load i64, i64* %tmp
+  %tmp = load i64*, i64** %ptr, align 8
+  %result = load i64, i64* %tmp, align 8
 ; CHECK-NOT: load
 ; CHECK: %[[result:.*]] = phi i64 [ %[[ext]], %if.then ], [ 0, %entry ]