mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-19 17:33:29 +00:00
ce7f347da2
a pre-splitting pass over loads and stores. Historically, splitting could cause enough problems that I hamstrung the entire process with a requirement that splittable integer loads and stores must cover the entire alloca. All smaller loads and stores were unsplittable to prevent chaos from ensuing. With the new pre-splitting logic that does load/store pair splitting I introduced in r225061, we can now very nicely handle arbitrarily splittable loads and stores. In order to fully benefit from these smarts, we need to mark all of the integer loads and stores as splittable. However, we don't actually want to rewrite partitions with all integer loads and stores marked as splittable. This will fail to extract scalar integers from aggregates, which is kind of the point of SROA. =] In order to resolve this, what we really want to do is only do pre-splitting on the alloca slices with integer loads and stores fully splittable. This allows us to uncover all non-integer uses of the alloca that would benefit from a split in an integer load or store (and where introducing the split is safe because it is just memory transfer from a load to a store). Once done, we make all the non-whole-alloca integer loads and stores unsplittable just as they have historically been, repartition and rewrite. The result is that when there are integer loads and stores anywhere within an alloca (such as from a memcpy of a sub-object of a larger object), we can split them up if there are non-integer components to the aggregate hiding beneath. I've added the challenging test cases to demonstrate how this is able to promote to scalars even a case where we have even *partially* overlapping loads and stores. This restores the single-store behavior for small arrays of i8s which is really nice. I've restored both the little endian testing and big endian testing for these exactly as they were prior to r225061. It also forced me to be more aggressive in an alignment test to actually defeat SROA. 
=] Without the added volatiles there, we actually split up the weird i16 loads and produce nice double allocas with better alignment. This also uncovered a number of bugs where we failed to handle splittable load and store slices which didn't have a beginning offset of zero. Those fixes are included, and without them the existing test cases explode in glorious fireworks. =] I've kept support for leaving whole-alloca integer loads and stores as splittable even for the purpose of rewriting, but I think that's likely no longer needed. With the new pre-splitting, we might be able to remove all the splitting support for loads and stores from the rewriter. Not doing that in this patch to try to isolate any performance regressions that it causes in an easy to find and revert chunk. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225074 91177308-0d34-0410-b5e6-96231b3b80d8
175 lines
6.0 KiB
LLVM
175 lines
6.0 KiB
LLVM
; RUN: opt < %s -sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"

; Old-style (pre-LLVM-7) memcpy intrinsic with an explicit alignment
; argument (the fourth i32); used by several tests below to carry the
; alignment that SROA must propagate.
declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
; Test that the alloca's over-alignment (align 16) is propagated onto the
; split i8 loads and stores SROA creates from the memcpy pair: the byte at
; offset 0 keeps align 16, the byte at offset 1 drops to align 1.
define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
; CHECK-LABEL: @test1(
; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 0
; CHECK: %[[a0:.*]] = load i8* %[[gep_a0]], align 16
; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 1
; CHECK: %[[a1:.*]] = load i8* %[[gep_a1]], align 1
; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 0
; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 1
; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
; CHECK: ret void

entry:
  %alloca = alloca { i8, i8 }, align 16
  %gep_a = getelementptr { i8, i8 }* %a, i32 0, i32 0
  %gep_alloca = getelementptr { i8, i8 }* %alloca, i32 0, i32 0
  %gep_b = getelementptr { i8, i8 }* %b, i32 0, i32 0

  ; NOTE(review): 420 does not fit in i8 and wraps to 164; kept verbatim
  ; from the original test -- the value itself is irrelevant, only the
  ; align 16 on the store matters.
  store i8 420, i8* %gep_alloca, align 16

  ; Copy *a into the alloca and then the alloca into *b; SROA pre-splits
  ; this pair into the per-byte loads/stores checked above.
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_alloca, i8* %gep_a, i32 2, i32 16, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_b, i8* %gep_alloca, i32 2, i32 16, i1 false)
  ret void
}
; Test that SROA still rewrites the alloca when a volatile i16 access at
; offset 1 forces below-ABI alignment: the i16 slice survives as an i16
; alloca while the byte at offset 2 is accessed directly.
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK: alloca i16
; CHECK: load i8* %{{.*}}
; CHECK: store i8 42, i8* %{{.*}}
; CHECK: ret void

entry:
  %a = alloca { i8, i8, i8, i8 }, align 2
  ; Misaligned (offset 1) volatile i16 store keeps this slice alive.
  %gep1 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 1
  %cast1 = bitcast i8* %gep1 to i16*
  store volatile i16 0, i16* %cast1
  %gep2 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 2
  %result = load i8* %gep2
  store i8 42, i8* %gep2
  ret void
}
define void @PR13920(<2 x i64>* %a, i16* %b) {
; Test that alignments on memcpy intrinsics get propagated to loads and stores:
; the memcpys declare only align 2, so the rewritten vector load/store must
; carry align 2 rather than the alloca's align 16.
; CHECK-LABEL: @PR13920(
; CHECK: load <2 x i64>* %a, align 2
; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
; CHECK: ret void

entry:
  %aa = alloca <2 x i64>, align 16
  %aptr = bitcast <2 x i64>* %a to i8*
  %aaptr = bitcast <2 x i64>* %aa to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %aaptr, i8* %aptr, i32 16, i32 2, i1 false)
  %bptr = bitcast i16* %b to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %bptr, i8* %aaptr, i32 16, i32 2, i1 false)
  ret void
}
define void @test3(i8* %x) {
; Test that when we promote an alloca to a type with lower ABI alignment, we
; provide the needed explicit alignment that code using the alloca may be
; expecting. However, also check that any offset within an alloca can in turn
; reduce the alignment.
; CHECK-LABEL: @test3(
; CHECK: alloca [22 x i8], align 8
; CHECK: alloca [18 x i8], align 2
; CHECK: ret void

entry:
  %a = alloca { i8*, i8*, i8* }
  %b = alloca { i8*, i8*, i8* }
  ; 22-byte copy at align 8: rewritten alloca must keep the explicit align 8.
  %a_raw = bitcast { i8*, i8*, i8* }* %a to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a_raw, i8* %x, i32 22, i32 8, i1 false)
  ; 18-byte copy at offset 6 with align 2: the interior offset reduces the
  ; achievable alignment of the rewritten alloca to 2.
  %b_raw = bitcast { i8*, i8*, i8* }* %b to i8*
  %b_gep = getelementptr i8* %b_raw, i32 6
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_gep, i8* %x, i32 18, i32 2, i1 false)
  ret void
}
define void @test5() {
; Test that we preserve underaligned loads and stores when splitting. The use
; of volatile in this test case is just to force the loads and stores to not be
; split or promoted out of existence.
;
; CHECK-LABEL: @test5(
; CHECK: alloca [9 x i8]
; CHECK: alloca [9 x i8]
; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
; CHECK: load volatile i16* %{{.*}}, align 1
; CHECK: load double* %{{.*}}, align 1
; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
; CHECK: load volatile i16* %{{.*}}, align 1
; CHECK: ret void

entry:
  %a = alloca [18 x i8]
  ; First 9-byte half: a double at offset 0 plus an i16 straddling offset 7.
  %raw1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 0
  %ptr1 = bitcast i8* %raw1 to double*
  store volatile double 0.0, double* %ptr1, align 1
  %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
  %weird_cast1 = bitcast i8* %weird_gep1 to i16*
  %weird_load1 = load volatile i16* %weird_cast1, align 1

  ; Second 9-byte half: a double at offset 9 plus an i16 at offset 16; all
  ; accesses are align 1 and must stay align 1 after the alloca is split.
  %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
  %ptr2 = bitcast i8* %raw2 to double*
  %d1 = load double* %ptr1, align 1
  store volatile double %d1, double* %ptr2, align 1
  %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
  %weird_cast2 = bitcast i8* %weird_gep2 to i16*
  %weird_load2 = load volatile i16* %weird_cast2, align 1

  ret void
}
define void @test6() {
; Test that we promote alignment when the underlying alloca switches to one
; that innately provides it: once the [16 x i8] is split into two double
; allocas, the doubles' natural alignment suffices and no explicit align
; attribute should remain.
; CHECK-LABEL: @test6(
; CHECK: alloca double
; CHECK: alloca double
; CHECK-NOT: align
; CHECK: ret void

entry:
  %a = alloca [16 x i8]
  %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
  %ptr1 = bitcast i8* %raw1 to double*
  store volatile double 0.0, double* %ptr1, align 1

  %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
  %ptr2 = bitcast i8* %raw2 to double*
  %val = load double* %ptr1, align 1
  store volatile double %val, double* %ptr2, align 1

  ret void
}
define void @test7(i8* %out) {
; Test that we properly compute the destination alignment when rewriting
; memcpys as direct loads or stores: the memcpys declare align 0 (unknown),
; so the rewritten double accesses must conservatively use align 1.
; CHECK-LABEL: @test7(
; CHECK-NOT: alloca

entry:
  %a = alloca [16 x i8]
  %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
  %ptr1 = bitcast i8* %raw1 to double*
  %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
  %ptr2 = bitcast i8* %raw2 to double*

  ; Incoming copy is rewritten into two align-1 double loads from %out.
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i32 0, i1 false)
; CHECK: %[[val2:.*]] = load double* %{{.*}}, align 1
; CHECK: %[[val1:.*]] = load double* %{{.*}}, align 1

  ; Swap the two halves through the alloca.
  %val1 = load double* %ptr2, align 1
  %val2 = load double* %ptr1, align 1

  store double %val1, double* %ptr1, align 1
  store double %val2, double* %ptr2, align 1

  ; Outgoing copy is rewritten into two align-1 double stores to %out.
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i32 0, i1 false)
; CHECK: store double %[[val1]], double* %{{.*}}, align 1
; CHECK: store double %[[val2]], double* %{{.*}}, align 1

  ret void
; CHECK: ret void
}