diff --git a/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmGen.kt b/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmGen.kt index d3e3c71cd..61323798f 100644 --- a/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmGen.kt +++ b/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmGen.kt @@ -87,7 +87,7 @@ class AsmGen(private val program: Program, assemblyLines.addAll(outputFile.readLines()) var optimizationsDone = 1 while (optimizationsDone > 0) { - optimizationsDone = optimizeAssembly(assemblyLines) + optimizationsDone = optimizeAssembly(assemblyLines, options.compTarget.machine) } outputFile.printWriter().use { for (line in assemblyLines) { it.println(line) } diff --git a/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmOptimizer.kt b/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmOptimizer.kt index cff924a73..74c1658f3 100644 --- a/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmOptimizer.kt +++ b/codeGeneration/src/prog8/compiler/target/cpu6502/codegen/AsmOptimizer.kt @@ -1,10 +1,12 @@ package prog8.compiler.target.cpu6502.codegen +import prog8.compilerinterface.IMachineDefinition + // note: see https://wiki.nesdev.org/w/index.php/6502_assembly_optimisations -fun optimizeAssembly(lines: MutableList): Int { +fun optimizeAssembly(lines: MutableList, machine: IMachineDefinition): Int { var numberOfOptimizations = 0 @@ -31,7 +33,7 @@ fun optimizeAssembly(lines: MutableList): Int { numberOfOptimizations++ } - mods = optimizeStoreLoadSame(linesByFour) + mods = optimizeStoreLoadSame(linesByFour, machine) if(mods.isNotEmpty()) { apply(mods, lines) linesByFour = getLinesBy(lines, 4) @@ -46,7 +48,7 @@ fun optimizeAssembly(lines: MutableList): Int { } var linesByFourteen = getLinesBy(lines, 14) - mods = optimizeSameAssignments(linesByFourteen) + mods = optimizeSameAssignments(linesByFourteen, machine) if(mods.isNotEmpty()) { apply(mods, lines) linesByFourteen = getLinesBy(lines, 14) @@ -111,9 +113,9 @@ private fun optimizeUselessStackByteWrites(linesByFour: List>>): List { +private fun optimizeSameAssignments(linesByFourteen: List>>, machine: IMachineDefinition): List { - // Optimize sequential assignments of the isSameAs value to various targets (bytes, words, floats) + // Optimize sequential assignments of the same value to various targets (bytes, words, floats) // the float one is the one that requires 2*7=14 lines of code to check... // The better place to do this is in the Compiler instead and never create these types of assembly, but hey @@ -135,9 +137,13 @@ private fun optimizeSameAssignments(linesByFourteen: List remove second lda/ldy pair (fifth and sixth lines) - mods.add(Modification(lines[4].index, true, null)) - mods.add(Modification(lines[5].index, true, null)) + // lda/ldy sta/sty twice the same word --> remove second lda/ldy pair (fifth and sixth lines) + val address1 = getAddressArg(first) + val address2 = getAddressArg(second) + if(address1==null || address2==null || (!machine.isIOAddress(address1) && !machine.isIOAddress(address2))) { + mods.add(Modification(lines[4].index, true, null)) + mods.add(Modification(lines[5].index, true, null)) + } } } @@ -145,8 +151,10 @@ private fun optimizeSameAssignments(linesByFourteen: List remove second lda (third line) - mods.add(Modification(lines[2].index, true, null)) + // lda value / sta ? / lda same-value / sta ? -> remove second lda (third line) + val address = getAddressArg(first) + if(address==null || !machine.isIOAddress(address)) + mods.add(Modification(lines[2].index, true, null)) } } @@ -227,10 +235,13 @@ private fun optimizeSameAssignments(linesByFourteen: List>>): List { +private fun optimizeStoreLoadSame(linesByFour: List>>, machine: IMachineDefinition): List { // sta X + lda X, sty X + ldy X, stx X + ldx X -> the second instruction can OFTEN be eliminated - // TODO this is not true if X is not a regular RAM memory address (but instead mapped I/O or ROM) but how does this code know? - // should this optimization be removed???? or teach it about the InRegularRAM ? val mods = mutableListOf() for (lines in linesByFour) { val first = lines[1].value.trimStart() @@ -305,7 +338,8 @@ private fun optimizeStoreLoadSame(linesByFour: List>>) } else { // no branch instruction follows, we can remove the load instruction - true + val address = getAddressArg(lines[2].value) + address==null || !machine.isIOAddress(address) } if(attemptRemove) { @@ -319,6 +353,17 @@ private fun optimizeStoreLoadSame(linesByFour: List>>) return mods } +private fun getAddressArg(line: String): UInt? { + val loadArg = line.trimStart().substring(3).trim() + return when { + loadArg.startsWith('$') -> loadArg.substring(1).toUIntOrNull(16) + loadArg.startsWith('%') -> loadArg.substring(1).toUIntOrNull(2) + loadArg.startsWith('#') -> null + loadArg.startsWith('(') -> null + else -> loadArg.substring(1).toUIntOrNull() + } +} + private fun optimizeIncDec(linesByFour: List>>): List { // sometimes, iny+dey / inx+dex / dey+iny / dex+inx sequences are generated, these can be eliminated. val mods = mutableListOf() @@ -327,8 +372,12 @@ private fun optimizeIncDec(linesByFour: List>>): List< val second = lines[1].value if ((" iny" in first || "\tiny" in first) && (" dey" in second || "\tdey" in second) || (" inx" in first || "\tinx" in first) && (" dex" in second || "\tdex" in second) + || (" ina" in first || "\tina" in first) && (" dea" in second || "\tdea" in second) + || (" inc a" in first || "\tinc a" in first) && (" dec a" in second || "\tdec a" in second) || (" dey" in first || "\tdey" in first) && (" iny" in second || "\tiny" in second) - || (" dex" in first || "\tdex" in first) && (" inx" in second || "\tinx" in second)) { + || (" dex" in first || "\tdex" in first) && (" inx" in second || "\tinx" in second) + || (" dea" in first || "\tdea" in first) && (" ina" in second || "\tina" in second) + || (" dec a" in first || "\tdec a" in first) && (" inc a" in second || "\tinc a" in second)) { mods.add(Modification(lines[0].index, true, null)) mods.add(Modification(lines[1].index, true, null)) } diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 35016d78d..de2e45cbb 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -3,9 +3,7 @@ TODO For next compiler release (7.4) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -optimize: there is an optimization in AsmOptimizer that can only be done correctly - if it knows about regular ram vs io space ram distinction. - +... Blocked by an official Commander-x16 v39 release diff --git a/examples/test.p8 b/examples/test.p8 index ee39eeeca..7a81d626d 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -4,48 +4,48 @@ main { sub start() { - ubyte xx = 1 - ubyte yy = 2 - byte b1 - byte b2=10 + %asm {{ - ; result should be: 29 42 40 87 75 35 + lda $d020 + ldy $d021 + sta $d020 + sty $d021 + lda $d020 + ldy $d021 + sta $d020 + sty $d021 - xx=6 - yy=8 - yy = (xx+5)+(yy+10) - txt.print_ub(yy) ; 29 - txt.nl() - xx=6 - yy=8 - yy = (xx*3)+(yy*3) - txt.print_ub(yy) ; 42 - txt.nl() + lda $d020 + sta $d020 + lda $d020 + sta $d020 + lda $d020 + sta $d020 + lda $d020 + sta $d020 + sta $d020 + sta $d020 + sta $d020 + sta $d020 + sta $d020 + sta $d020 - b1=13 - b2=5 - b2 = (b1*5)-(b2*5) - txt.print_b(b2) ; 40 - txt.nl() + lda $c020 + sta $c020 + lda $c020 + sta $c020 + lda $c020 + sta $c020 + lda $c020 + sta $c020 + sta $c020 + sta $c020 + sta $c020 + sta $c020 + sta $c020 - b1=100 - b2=8 - b2 = (b1+5)-(b2+10) - txt.print_b(b2) ; 87 - txt.nl() - - b1=50 - b2=40 - b2 = (b1-5)+(b2-10) - txt.print_b(b2) ; 75 - txt.nl() - - b1=50 - b2=20 - b2 = (b1-5)-(b2-10) - txt.print_b(b2) ; 35 - txt.nl() + }} repeat { }