
ARM: Enable MachineScheduler and disable PostRAScheduler for swift.

Reapply r242500 now that the swift schedmodel includes LDRLIT.

This is mostly done to disable the PostRAScheduler, which optimizes for
instruction latencies and is therefore not a good fit for out-of-order
architectures. It also allows us to leave out the itinerary table for
swift in favor of the SchedModel information.
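
The gating condition is the MCSchedModel::isOutOfOrder() accessor added in the
first hunk below; the following is a minimal standalone C++ sketch of its
semantics (the field and method mirror that hunk, while the harness and the
buffer-size values are purely illustrative):

    #include <cassert>

    // Illustrative stand-in for the relevant slice of llvm::MCSchedModel.
    // A scheduling model marks a core as out-of-order by giving it a
    // micro-op buffer larger than one entry; the ARMSubtarget hooks in the
    // later hunks use exactly this test to pick the MachineScheduler and
    // skip the PostRA scheduler.
    struct SchedModelSketch {
      unsigned MicroOpBufferSize;
      bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
    };

    int main() {
      SchedModelSketch InOrder = {/*MicroOpBufferSize=*/0};     // simple in-order core
      SchedModelSketch OutOfOrder = {/*MicroOpBufferSize=*/32}; // buffered core (value illustrative)
      assert(!InOrder.isOutOfOrder());
      assert(OutOfOrder.isOutOfOrder());
      return 0;
    }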

This change leads to performance improvements/regressions of as much as
10% in some benchmarks; in fact, we lose 0.4% performance over the
llvm-testsuite for reasons that appear to be unknown or outside the
compiler's control. rdar://20803802 documents the investigation of
these effects.

While it is probably a good idea to make the same switch for the other
out-of-order ARM CPUs, I limited this change to swift because I cannot
run the benchmark verification on the other CPUs.

Differential Revision: http://reviews.llvm.org/D10513

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242588 91177308-0d34-0410-b5e6-96231b3b80d8
Matthias Braun 2015-07-17 23:18:30 +00:00
parent 0dec0e1ea5
commit 6f6ca40ef0
10 changed files with 48 additions and 1069 deletions

@@ -206,6 +206,9 @@ struct MCSchedModel {
/// scheduling class (itinerary class or SchedRW list).
bool isComplete() const { return CompleteModel; }
/// Return true if machine supports out of order execution.
bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
unsigned getNumProcResourceKinds() const {
return NumProcResourceKinds;
}

File diff suppressed because it is too large.

@@ -319,8 +319,19 @@ bool ARMSubtarget::hasSinCos() const {
return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
}
bool ARMSubtarget::enableMachineScheduler() const {
// Enable the MachineScheduler before register allocation for out-of-order
// architectures where we do not use the PostRA scheduler anymore (for now
// restricted to swift).
return getSchedModel().isOutOfOrder() && isSwift();
}
// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
// No need for PostRA scheduling on out of order CPUs (for now restricted to
// swift).
if (getSchedModel().isOutOfOrder() && isSwift())
return false;
return (!isThumb() || hasThumb2());
}

@@ -433,6 +433,9 @@ public:
/// compiler runtime or math libraries.
bool hasSinCos() const;
/// Returns true if machine scheduler should be enabled.
bool enableMachineScheduler() const override;
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;

@@ -11,25 +11,25 @@
; r0 = r0 / r2
; r1 = r1 / r3
;
; NOOPT: vmov [[B:d[0-9]+]], r2, r3
; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
; Move the low part of B into a register.
; Unfortunately, we cannot express that the 's' register is the low
; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
; NOOPT-NEXT: vmov r0, r1, [[RES]]
; NOOPT-NEXT: bx lr
;
; OPT-NOT: vmov
; OPT: udiv r0, r0, r2
; OPT-NEXT: udiv r1, r1, r3
; OPT: udiv r1, r1, r3
; OPT-NEXT: udiv r0, r0, r2
; OPT-NEXT: bx lr
define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
entry:

@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
; dependency) when it isn't dependent on last CPSR defining instruction.
; rdar://8928208
@@ -7,8 +7,10 @@
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
entry:
; CHECK-LABEL: t1:
; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
%0 = mul nsw i32 %a, %b
%1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
entry:
; CHECK-LABEL: t2:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body
while.body:
; CHECK: while.body
@@ -55,8 +56,7 @@ while.end:
define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
entry:
; CHECK-LABEL: t3:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body
while.body:
; CHECK: while.body

@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK: bne [[LOOP]]
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #1
; CHECK: dmb ish
; CHECK: movs r0, #1
; CHECK: bx lr
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #0
; CHECK: dmb ish
; CHECK: movs r0, #0
; CHECK: bx lr
%pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: dmb ishst
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]

@@ -20,8 +20,8 @@ entry:
for.body: ; preds = %entry, %for.body.3
; CHECK: %for.body
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
%0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end:                                          ; preds = %for.body, %for.body
for.body.1: ; preds = %for.body
; CHECK: %for.body.1
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
%2 = load i8, i8* %arrayidx.1, align 1
%conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1:                                       ; preds = %for.body
for.body.2: ; preds = %for.body.1
; CHECK: %for.body.2
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
%4 = load i8, i8* %arrayidx.2, align 1
%conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2:                                       ; preds = %for.body.1
for.body.3: ; preds = %for.body.2
; CHECK: %for.body.3
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
%6 = load i8, i8* %arrayidx.3, align 1
%conv6.3 = zext i8 %6 to i32

@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
;CHECK: str.w r[[INCREG]], [r0]
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: str r[[INCREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr
%lA = load <4 x i8>, <4 x i8>* %A, align 4
%inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4

@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
%A = load <4 x i8>*, <4 x i8>** %ptr
%trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
;CHECK: str r[[PTRREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr