LSR: rewrite inner loops only.
Rewriting the entire loop nest now requires -enable-lsr-nested.
See PR11035 for some performance data.
A few unit tests specifically test nested LSR, and are now under a flag.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@140762 91177308-0d34-0410-b5e6-96231b3b80d8
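Rewriting the entire loop nest is now opt-in. A minimal usage sketch (running the pass through opt by its loop-reduce name and the nest.ll file name are illustrative assumptions; only the -enable-lsr-nested flag itself comes from this patch):

  $ opt -loop-reduce -S nest.ll                      # new default: inner loops only
  $ opt -loop-reduce -enable-lsr-nested -S nest.ll   # previous behavior: whole nest

The test updates below pass the same flag to llc so those cases keep exercising nested LSR.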
@@ -78,6 +78,9 @@
 using namespace llvm;
 
 namespace llvm {
+cl::opt<bool> EnableNested(
+  "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
+
 cl::opt<bool> EnableRetry(
   "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
 }
@@ -723,11 +726,14 @@ void Cost::RateRegister(const SCEV *Reg,
     if (AR->getLoop() == L)
       AddRecCost += 1; /// TODO: This should be a function of the stride.
 
-    // If this is an addrec for a loop that's already been visited by LSR,
-    // don't second-guess its addrec phi nodes. LSR isn't currently smart
-    // enough to reason about more than one loop at a time. Consider these
-    // registers free and leave them alone.
-    else if (L->contains(AR->getLoop()) ||
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has either already run on inner loops, will not run
+    // on other loops, and cannot be expected to change sibling loops. If the
+    // AddRec exists, consider it's register free and leave it alone. Otherwise,
+    // do not consider this formula at all.
+    // FIXME: why do we need to generate such fomulae?
+    else if (!EnableNested || L->contains(AR->getLoop()) ||
              (!AR->getLoop()->contains(L) &&
               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
       for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
@@ -738,6 +744,10 @@ void Cost::RateRegister(const SCEV *Reg,
             SE.getSCEV(PN) == AR)
           return;
     }
+    if (!EnableNested) {
+      Loose();
+      return;
+    }
     // If this isn't one of the addrecs that the loop already has, it
     // would require a costly new phi and add. TODO: This isn't
     // precisely modeled right now.
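The two hunks above gate how Cost::RateRegister treats an add-recurrence that belongs to a different loop. A simplified C++ sketch of the decision (a hypothetical standalone helper, not code from the patch; HasMatchingPhi stands in for the header-phi scan, and returning false corresponds to Loose(), which discards the candidate formula):

// Hypothetical summary of the gate above; not part of LoopStrengthReduce.cpp.
static bool keepOtherLoopAddRec(bool EnableNested, bool HasMatchingPhi) {
  if (HasMatchingPhi)
    return true;   // an addrec phi already exists; treat the register as free
  if (!EnableNested)
    return false;  // nested LSR disabled: drop the formula (Loose())
  return true;     // nested LSR enabled: allow a costly new phi for that loop
}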
@@ -3801,6 +3811,12 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
   // If loop preparation eliminates all interesting IV users, bail.
   if (IU.empty()) return;
 
+  // Skip nested loops until we can model them better with forulae.
+  if (!EnableNested && !L->empty()) {
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return false;
+  }
+
   // Start collecting data and preparing for the solver.
   CollectInterestingTypesAndFactors();
   CollectFixupsAndInitialFormulae();
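The hunk above is what limits LSR to inner loops: L->empty() is true only for a loop with no children, so any outer loop is now skipped unless -enable-lsr-nested is given. A minimal, hypothetical loop nest in the same IR dialect as the tests below (not taken from this commit) showing the shape of code affected:

; Hypothetical two-deep nest: by default only %inner is strength-reduced now;
; -enable-lsr-nested lets LSR rewrite %outer as well.
define void @nest(i32* nocapture %p, i32 %n, i32 %m) nounwind {
entry:
  br label %outer

outer:                                            ; preds = %outer.latch, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %outer.latch ]
  %base = mul i32 %i, %m
  br label %inner

inner:                                            ; preds = %inner, %outer
  %j = phi i32 [ 0, %outer ], [ %j.next, %inner ]
  %idx = add i32 %base, %j
  %addr = getelementptr i32* %p, i32 %idx
  store i32 0, i32* %addr
  %j.next = add i32 %j, 1
  %inner.done = icmp eq i32 %j.next, %m
  br i1 %inner.done, label %outer.latch, label %inner

outer.latch:                                      ; preds = %inner
  %i.next = add i32 %i, 1
  %outer.done = icmp eq i32 %i.next, %n
  br i1 %outer.done, label %return, label %outer

return:                                           ; preds = %outer.latch
  ret void
}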
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -enable-lsr-nested < %s | FileCheck %s
 
 ; LSR should recognize that this is an unrolled loop which can use
 ; constant offset addressing, so that each of the following stores
@@ -8,6 +8,9 @@
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #64]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #96]
 
+; We can also save a register in the outer loop, but that requires
+; performing LSR on the outer loop.
+
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
 
 %0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
 
 define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
 ; CHECK: borf:
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
 
 define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
 ; CHECK: borf:
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -o %t
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested -o %t
 ; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
 ; RUN: grep addq %t | count 12
@@ -11,6 +11,10 @@
 ; to insert new induction variables. Previously it would create a
 ; flood of new induction variables.
 ; Also, the loop reversal should kick in once.
+;
+; In this example, performing LSR on the entire loop nest,
+; as opposed to only the inner loop can further reduce induction variables,
+; and their related instructions and registers.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 240
-; RUN: llc < %s -march=x86 | grep inc | count 1
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep cmp | grep 240
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep inc | count 1
 
 define i32 @foo(i32 %A, i32 %B, i32 %C, i32 %D) nounwind {
 entry:
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of loads added} | grep 2
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of spill slots allocated} | grep 1
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of machine instrs printed} | grep 34
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of loads added} | grep 2
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of spill slots allocated} | grep 1
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of machine instrs printed} | grep 34
 ; PR3495
+;
+; Note: this should not spill at all with either good LSR or good regalloc.
 
 target triple = "i386-pc-linux-gnu"
 @x = external global [8 x i32], align 32 ; <[8 x i32]*> [#uses=1]