diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fc53879..28c59c9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,15 +2,17 @@ ## Current version -* **Breaking change!** Renamed `inline` to `macro` +* **Breaking change!** Renamed `inline` to `macro`. -* Added support for parameters for macros written in Millfork +* Added support for parameters for macros written in Millfork. * Enabled calling macros with index expression parameters. -* Added optimizer hints: `inline`, `noinline`, `register` +* Added optimizer hints: `inline`, `noinline`, `register`. + +* Added command line flags `--size`, `--fast`, `--blast-processing`. -* Added `*'=` and `<<<<` operators +* Added `*'=` and `<<<<` operators. * Added return dispatch statements. diff --git a/doc/api/command-line.md b/doc/api/command-line.md index 7d51dac4..317c92e9 100644 --- a/doc/api/command-line.md +++ b/doc/api/command-line.md @@ -64,6 +64,12 @@ This may cause problems if the parameter table is stored next to a hardware regi * `--inline` – Inline functions automatically (experimental). See the [documentation about inlining](../abi/inlining.md). Computationally easy, can give decent gains. +* `--size` – Optimize for size, sacrificing some speed (experimental). + +* `--fast` – Optimize for speed, even if it increases the size a bit (experimental). + +* `--blast-processing` – Optimize for speed, even if it increases the size a lot (experimental). Implies `--inline`. + * `--detailed-flow` – Use detailed flow analysis (experimental). Very computationally expensive and not that great. * `--dangerous-optimizations` – Use dangerous optimizations (experimental). Dangerous optimizations are more likely to result in broken code. 
diff --git a/src/main/scala/millfork/CompilationOptions.scala b/src/main/scala/millfork/CompilationOptions.scala index 564da98d..ce80ef56 100644 --- a/src/main/scala/millfork/CompilationOptions.scala +++ b/src/main/scala/millfork/CompilationOptions.scala @@ -75,7 +75,7 @@ object CompilationFlag extends Enumeration { // compilation options: EmitIllegals, EmitCmosOpcodes, DecimalMode, ReadOnlyArrays, PreventJmpIndirectBug, // optimization options: - DetailedFlowAnalysis, DangerousOptimizations, InlineFunctions, OptimizeForSize, OptimizeForSpeed, + DetailedFlowAnalysis, DangerousOptimizations, InlineFunctions, OptimizeForSize, OptimizeForSpeed, OptimizeForSonicSpeed, // memory allocation options VariableOverlap, CompactReturnDispatchParams, // runtime check options diff --git a/src/main/scala/millfork/Main.scala b/src/main/scala/millfork/Main.scala index 79cabc44..778a04bd 100644 --- a/src/main/scala/millfork/Main.scala +++ b/src/main/scala/millfork/Main.scala @@ -226,14 +226,22 @@ object Main { flag("--inline").action { c => c.changeFlag(CompilationFlag.InlineFunctions, true) }.description("Inline functions automatically.") - flag("-Of", "--fast").action { c => - c.changeFlag(CompilationFlag.OptimizeForSize, false) - c.changeFlag(CompilationFlag.OptimizeForSpeed, true) - }.description("Optimize for speed (experimental).") flag("-Os", "--size").action { c => c.changeFlag(CompilationFlag.OptimizeForSize, true) c.changeFlag(CompilationFlag.OptimizeForSpeed, false) - }.description("Optimize for size (experimental).") + c.changeFlag(CompilationFlag.OptimizeForSonicSpeed, false) + }.description("Optimize for size at cost of lower speed (experimental).") + flag("-Of", "--fast").action { c => + c.changeFlag(CompilationFlag.OptimizeForSize, false) + c.changeFlag(CompilationFlag.OptimizeForSpeed, true) + c.changeFlag(CompilationFlag.OptimizeForSonicSpeed, false) + }.description("Optimize for speed at cost of bigger size (experimental).") + flag("-Ob", 
"--blast-processing").action { c => + c.changeFlag(CompilationFlag.OptimizeForSize, false) + c.changeFlag(CompilationFlag.OptimizeForSpeed, true) + c.changeFlag(CompilationFlag.OptimizeForSonicSpeed, true) + c.changeFlag(CompilationFlag.InlineFunctions, true) + }.description("Optimize for speed at cost of much bigger size (experimental). Implies --inline.") flag("--detailed-flow").action { c => c.changeFlag(CompilationFlag.DetailedFlowAnalysis, true) }.description("Use detailed flow analysis (experimental).") diff --git a/src/main/scala/millfork/OptimizationPresets.scala b/src/main/scala/millfork/OptimizationPresets.scala index f07795ab..4e3a8636 100644 --- a/src/main/scala/millfork/OptimizationPresets.scala +++ b/src/main/scala/millfork/OptimizationPresets.scala @@ -39,6 +39,7 @@ object OptimizationPresets { LaterOptimizations.PointlessLoadAfterStore, AlwaysGoodOptimizations.PointlessOperationAfterLoad, AlwaysGoodOptimizations.IdempotentDuplicateRemoval, + LoopUnrolling.LoopUnrolling, AlwaysGoodOptimizations.ConstantIndexPropagation, AlwaysGoodOptimizations.PointlessLoadBeforeReturn, AlwaysGoodOptimizations.PoinlessFlagChange, @@ -141,6 +142,7 @@ object OptimizationPresets { AlwaysGoodOptimizations.IncrementingIndexRegistersAfterTransfer, AlwaysGoodOptimizations.IndexComparisonOptimization, AlwaysGoodOptimizations.IndexSequenceOptimization, + LoopUnrolling.LoopUnrolling, AlwaysGoodOptimizations.MathOperationOnTwoIdenticalMemoryOperands, AlwaysGoodOptimizations.ModificationOfJustWrittenValue, AlwaysGoodOptimizations.NonetAddition, diff --git a/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala b/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala new file mode 100644 index 00000000..a579de64 --- /dev/null +++ b/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala @@ -0,0 +1,147 @@ +package millfork.assembly.opt + +import java.util.concurrent.atomic.AtomicInteger + +import millfork.{CompilationFlag, CompilationOptions} +import 
millfork.assembly.AssemblyLine
+import millfork.assembly.OpcodeClasses._
+import millfork.assembly.Opcode._
+import millfork.assembly.AddrMode._
+import millfork.env.{Constant, Label, MemoryAddressConstant}
+
+/**
+  * Peephole rules that replace short counted loops over X or Y with straight-line copies of the loop body.
+  *
+  * @author Karol Stasiak
+  */
+object LoopUnrolling {
+
+  object Unrolling extends Enumeration {
+    val X, Y, Var = Value
+  }
+
+  val counter = new AtomicInteger(40000)
+
+  def getNextLabel(prefix: String) = f".$prefix%s__${counter.getAndIncrement()}%05d"
+
+  // Keys under which the rules below store captured lines and values in the AssemblyMatchingContext.
+  private val Initialization = 634
+  private val Start = 453
+  private val End = 312
+  private val Skip = 1596
+  private val Back = 5473
+  private val Body = 6354
+  private val Step = 63546
+  private val BodyWithStep = 6355
+
+  /**
+    * Decides whether the loop captured in `ctx` may be unrolled under the active size/speed flags.
+    *
+    * @param branchingSize bytes counted for the loop's branches in the size estimate (rules pass 4 with a BEQ guard, 2 without)
+    * @param index         which counter the matching rule steps; decides the step size used in the estimate
+    * @return true if the block is externally linear, the trip count is small enough and the size growth is acceptable
+    */
+  def isFeasible(ctx: AssemblyMatchingContext, branchingSize: Int, index: Unrolling.Value): Boolean = {
+    if (!ctx.isExternallyLinearBlock(Body)) return false
+    val bodyCode = ctx.get[List[AssemblyLine]](Body)
+    val start = ctx.get[Int](Start)
+    val end = ctx.getOrDefault[Int](End, 0) // no CPX/CPY captured: the BNE exits when the counter reaches 0
+    // start == end is not an empty loop: the body runs before the exit test, so (unless a BEQ guard skips it)
+    // the counter has to wrap all the way round to hit `end` again. Emitting zero copies would drop the body.
+    if (start == end) return false
+    val increasing = isIncreasing(ctx)
+    if (increasing != (start < end)) return false // overflow not supported
+    val count = Math.abs(start - end)
+    if (count > 32) return false
+    if (count > 8 && !ctx.compilationOptions.flag(CompilationFlag.OptimizeForSonicSpeed)) return false
+    if (count > 3 && !ctx.compilationOptions.flag(CompilationFlag.OptimizeForSpeed)) return false
+    val onlyUsedForArrayIndexing = index match {
+      case Unrolling.Var => false
+      case Unrolling.X => bodyCode.forall(line => !ConcernsX(line) || line.addrMode == AbsoluteX)
+      case Unrolling.Y => bodyCode.forall(line => !ConcernsY(line) || line.addrMode == AbsoluteY)
+    }
+    val stepSize = index match {
+      case Unrolling.Var => 3
+      case _ => 1
+    }
+    val cmpExists = ctx.getOrDefault[Int](End, -1) >= 0
+    val bodySize = bodyCode.map(_.sizeInBytes).sum
+    val sizeBefore = branchingSize + bodySize + stepSize + (if (cmpExists) 2 else 0)
+    val sizeAfter = count * (bodySize + (if (onlyUsedForArrayIndexing) 0 else stepSize))
+    if (sizeAfter <= sizeBefore) return true
+    if (!ctx.compilationOptions.flag(CompilationFlag.OptimizeForSpeed)) return false
+    if (ctx.compilationOptions.flag(CompilationFlag.OptimizeForSonicSpeed)) {
+      (sizeAfter - sizeBefore < 128) && (sizeAfter < sizeBefore * 32)
+    } else {
+      (sizeAfter - sizeBefore < 64) && (sizeAfter < sizeBefore * 8)
+    }
+  }
+
+  // True when the captured step instruction is one of the incrementing opcodes (INX, INY, INC, ISC).
+  private def isIncreasing(ctx: AssemblyMatchingContext) = {
+    val opcode = ctx.get[List[AssemblyLine]](Step).head.opcode
+    opcode == INX || opcode == INY || opcode == INC || opcode == ISC
+  }
+
+  // Prefixes every label defined inside `code`, and every reference to it, with a fresh prefix so copies do not clash.
+  private def fixLabels(code: List[AssemblyLine]) = {
+    val localLabels = code.flatMap {
+      case AssemblyLine(LABEL, _, MemoryAddressConstant(Label(l)), _) => Some(l)
+      case _ => None
+    }.toSet
+    val labelPrefix = getNextLabel("ur")
+    code.map {
+      case s@AssemblyLine(_, _, MemoryAddressConstant(Label(l)), _) if localLabels(l) =>
+        s.copy(parameter = MemoryAddressConstant(Label(labelPrefix + l)))
+      case s => s
+    }
+  }
+
+  // Replacement shared by all rules: the captured counter load followed by |start - end| relabelled copies of body + step.
+  private def unrolled(ctx: AssemblyMatchingContext): List[AssemblyLine] = {
+    val start = ctx.get[Int](Start)
+    val end = ctx.getOrDefault[Int](End, 0)
+    val iteration = ctx.get[List[AssemblyLine]](BodyWithStep)
+    ctx.get[List[AssemblyLine]](Initialization) ++ (0 until Math.abs(start - end)).flatMap(_ => fixLabels(iteration))
+  }
+
+  val LoopUnrolling = new RuleBasedAssemblyOptimization("Loop unrolling",
+    needsFlowInfo = FlowInfoRequirement.NoRequirement,
+    (Elidable & HasOpcode(LDX) & MatchNumericImmediate(Start)).capture(Initialization) ~
+      (Elidable & HasOpcode(BEQ) & MatchParameter(Skip)) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesX)).*.capture(Body) ~
+        (Elidable & HasOpcodeIn(Set(DEX, INX))).capture(Step)
+        ).capture(BodyWithStep) ~
+      (Elidable & HasOpcode(CPX) & MatchNumericImmediate(End)).? ~
+      (Elidable & HasOpcode(BNE) & MatchParameter(Back)) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Skip)) ~
+      Where(ctx => isFeasible(ctx, 4, Unrolling.X)) ~~> ((_, ctx) => unrolled(ctx)),
+    (Elidable & HasOpcode(LDX) & MatchNumericImmediate(Start)).capture(Initialization) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesX)).*.capture(Body) ~
+        (Elidable & HasOpcodeIn(Set(DEX, INX))).capture(Step)
+        ).capture(BodyWithStep) ~
+      (Elidable & HasOpcode(CPX) & MatchNumericImmediate(End)).? ~
+      (Elidable & HasOpcode(BNE) & MatchParameter(Back)) ~
+      Where(ctx => isFeasible(ctx, 2, Unrolling.X)) ~~> ((_, ctx) => unrolled(ctx)),
+    (Elidable & HasOpcode(LDY) & MatchNumericImmediate(Start)).capture(Initialization) ~
+      (Elidable & HasOpcode(BEQ) & MatchParameter(Skip)) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesY)).*.capture(Body) ~
+        (Elidable & HasOpcodeIn(Set(DEY, INY))).capture(Step)
+        ).capture(BodyWithStep) ~
+      (Elidable & HasOpcode(CPY) & MatchNumericImmediate(End)).? ~
+      (Elidable & HasOpcode(BNE) & MatchParameter(Back)) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Skip)) ~
+      Where(ctx => isFeasible(ctx, 4, Unrolling.Y)) ~~> ((_, ctx) => unrolled(ctx)),
+    (Elidable & HasOpcode(LDY) & MatchNumericImmediate(Start)).capture(Initialization) ~
+      (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesY)).*.capture(Body) ~
+        (Elidable & HasOpcodeIn(Set(DEY, INY))).capture(Step)
+        ).capture(BodyWithStep) ~
+      (Elidable & HasOpcode(CPY) & MatchNumericImmediate(End)).? ~
+      (Elidable & HasOpcode(BNE) & MatchParameter(Back)) ~
+      Where(ctx => isFeasible(ctx, 2, Unrolling.Y)) ~~> ((_, ctx) => unrolled(ctx)),
+  )
+}
diff --git a/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala b/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
index 07ae5ad4..aece6fad 100644
--- a/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
+++ b/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
@@ -41,7 +41,7 @@ class RuleBasedAssemblyOptimization(val name: String, val needsFlowInfo: FlowInf
       case Nil => Nil
       case head :: tail =>
         for ((rule, index) <- rules.zipWithIndex) {
-          val ctx = new AssemblyMatchingContext
+          val ctx = new AssemblyMatchingContext(options)
           rule.pattern.matchTo(ctx, code) match {
             case Some(rest: List[(FlowInfo, AssemblyLine)]) =>
               val matchedChunkToOptimize: List[AssemblyLine] = 
code.take(code.length - rest.length).map(_._2) @@ -69,7 +69,7 @@ class RuleBasedAssemblyOptimization(val name: String, val needsFlowInfo: FlowInf } } -class AssemblyMatchingContext { +class AssemblyMatchingContext(val compilationOptions: CompilationOptions) { private val map = mutable.Map[Int, Any]() override def toString: String = map.mkString(", ") @@ -101,7 +101,8 @@ class AssemblyMatchingContext { } } - def get[T: Manifest](i: Int): T = { + private def getImpl[T: Manifest](i: Int): AnyRef = { + if (!map.contains(i)) return null val t = map(i) val clazz = implicitly[Manifest[T]].runtimeClass match { case java.lang.Integer.TYPE => classOf[java.lang.Integer] @@ -110,7 +111,7 @@ class AssemblyMatchingContext { case x => x } if (clazz.isInstance(t)) { - t.asInstanceOf[T] + t.asInstanceOf[AnyRef] } else { if (i eq null) { ErrorReporting.fatal(s"Value at index $i is null") @@ -120,6 +121,23 @@ class AssemblyMatchingContext { } } + def get[T: Manifest](i: Int): T = { + val v = getImpl[T](i) + if (v eq null) { + ErrorReporting.fatal(s"Value at index $i is null") + } + v.asInstanceOf[T] + } + + def getOrDefault[T: Manifest](i: Int, defau: T): T = { + val v = getImpl[T](i) + if (v eq null) { + defau + } else { + v.asInstanceOf[T] + } + } + def isExternallyLinearBlock(i: Int): Boolean = { val labels = mutable.Set[String]() val jumps = mutable.Set[String]() @@ -749,6 +767,18 @@ case class MatchImmediate(i: Int) extends AssemblyLinePattern { override def toString: String = s"(?<$i>#)" } +case class MatchNumericImmediate(i: Int) extends AssemblyLinePattern { + override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean = + if (line.addrMode == AddrMode.Immediate) { + line.parameter.quickSimplify match { + case NumericConstant(value, _) => ctx.addObject(i, value.toInt & 0xff) + case _ => false + } + } else false + + override def toString: String = s"(?<$i>#)" +} + case class DoesntChangeIndexingInAddrMode(i: Int) extends 
AssemblyLinePattern { override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean = diff --git a/src/main/scala/millfork/output/Assembler.scala b/src/main/scala/millfork/output/Assembler.scala index d466c671..abb5aef4 100644 --- a/src/main/scala/millfork/output/Assembler.scala +++ b/src/main/scala/millfork/output/Assembler.scala @@ -147,9 +147,13 @@ class Assembler(private val program: Program, private val rootEnv: Environment) val potentiallyInlineable: Map[String, Int] = InliningCalculator.getPotentiallyInlineableFunctions( program, - options.flags(CompilationFlag.InlineFunctions), - if (options.flags(CompilationFlag.OptimizeForSpeed)) 1.3 else 1.0, - if (options.flags(CompilationFlag.OptimizeForSpeed)) 8.0 else 1.2) + options.flags(CompilationFlag.InlineFunctions) || options.flags(CompilationFlag.OptimizeForSonicSpeed), + if (options.flags(CompilationFlag.OptimizeForSonicSpeed)) 4.0 + else if (options.flags(CompilationFlag.OptimizeForSpeed)) 1.3 + else 1.0, + if (options.flags(CompilationFlag.OptimizeForSonicSpeed)) 12.0 + else if (options.flags(CompilationFlag.OptimizeForSpeed)) 8.0 + else 1.2) var inlinedFunctions = Map[String, List[AssemblyLine]]() val compiledFunctions = mutable.Map[String, List[AssemblyLine]]()