From 78c7ee247ae6aff19fdaf745262861a28eed66f0 Mon Sep 17 00:00:00 2001
From: Irmen de Jong
Date: Tue, 16 Jul 2024 00:25:29 +0200
Subject: [PATCH] generate 65c02 TSB/TRB instructions in certain cases

---
 .../src/prog8/codegen/cpu6502/AsmOptimizer.kt | 30 +++++++++
 .../assignment/AugmentableAssignmentAsmGen.kt | 18 +++++
 docs/source/todo.rst                          |  1 -
 examples/test.p8                              | 66 ++++++-------------
 4 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
index 7aea2813c..dfb34647f 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
@@ -50,6 +50,13 @@ internal fun optimizeAssembly(lines: MutableList<String>, machine: IMachineDefin
         numberOfOptimizations++
     }
 
+    mods = optimizeTSBtoRegularOr(linesByFour)
+    if(mods.isNotEmpty()) {
+        apply(mods, lines)
+        linesByFour = getLinesBy(lines, 4)
+        numberOfOptimizations++
+    }
+
     var linesByFourteen = getLinesBy(lines, 14)
     mods = optimizeSameAssignments(linesByFourteen, machine, symbolTable)
     if(mods.isNotEmpty()) {
@@ -684,6 +691,29 @@ private fun optimizeUselessPushPopStack(linesByFour: Sequence<List<IndexedValue<
     return mods
 }
 
+
+private fun optimizeTSBtoRegularOr(linesByFour: Sequence<List<IndexedValue<String>>>): List<Modification> {
+    // Asm peephole: lda var2 / tsb var1 / lda var1 Replace this with this to save 1 cycle: lda var1 / ora var2 / sta var1
+    val mods = mutableListOf<Modification>()
+
+    for(lines in linesByFour) {
+        val first = lines[0].value.trimStart()
+        val second = lines[1].value.trimStart()
+        val third = lines[2].value.trimStart()
+        if(first.startsWith("lda") && second.startsWith("tsb") && third.startsWith("lda")) {
+            val operand1 = first.substring(3)
+            val operand2 = second.substring(3)
+            val operand3 = third.substring(3)
+            if(operand1!=operand2 && operand2==operand3) {
+                mods.add(Modification(lines[0].index, false, " lda $operand2 ; op2"))
+                mods.add(Modification(lines[1].index, false, " ora $operand1 ; op1"))
+                mods.add(Modification(lines[2].index, false, " sta $operand2 ; op2"))
+            }
+        }
+    }
+    return mods
+}
+
 private fun optimizeUnneededTempvarInAdd(linesByFour: Sequence<List<IndexedValue<String>>>): List<Modification> {
     // sequence: sta P8ZP_SCRATCH_XX / lda something / clc / adc P8ZP_SCRATCH_XX
     // this can be performed without the scratch variable: clc / adc something
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
index 1d05e5ff4..d3185475b 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
@@ -1086,6 +1086,15 @@ $shortcutLabel:""")
             }
         }
 
+        if(asmgen.isTargetCpu(CpuType.CPU65c02)) {
+            if(operator=="&" && value is PtPrefix && value.operator=="~") {
+                // M &= ~A --> use special TRB 65c02 instruction for that
+                asmgen.assignExpressionToRegister(value.value, RegisterOrPair.A, dt in SignedDatatypes)
+                asmgen.out(" trb $name")
+                return
+            }
+        }
+
         // normal evaluation
         asmgen.assignExpressionToRegister(value, RegisterOrPair.A, dt in SignedDatatypes)
         inplacemodificationRegisterAwithVariableWithSwappedOperands(operator, name, dt in SignedDatatypes)
@@ -1094,6 +1103,15 @@ $shortcutLabel:""")
 
     private fun inplacemodificationByteVariableWithVariable(name: String, dt: DataType, operator: String, otherName: String) {
         // note: no logical and/or shortcut here, not worth it due to simple right operand
+
+        if(asmgen.isTargetCpu(CpuType.CPU65c02)) {
+            if(operator=="|") {
+                // M |= A --> use special TSB 65c02 instruction for that
+                asmgen.out(" lda $otherName | tsb $name")
+                return
+            }
+        }
+
         asmgen.out(" lda $name")
         inplacemodificationRegisterAwithVariable(operator, otherName, dt in SignedDatatypes)
         asmgen.out(" sta $name")
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 84c1baa73..ea1e06108 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -91,7 +91,6 @@ Libraries:
 
 Optimizations:
 
-- For 65c02 targets: use trb and tsb instructions if possible (f.ex. generating ``lda cmask trb nvub`` for ``nvub &= ~cmask`` and ``lda cmask and fillm tsb nvub`` for ``nvub |= cmask & fillm``
 - VariableAllocator: can we think of a smarter strategy for allocating variables into zeropage, rather than first-come-first-served?
   for instance, vars used inside loops first, then loopvars, then uwords used as pointers, then the rest
 - various optimizers skip stuff if compTarget.name==VMTarget.NAME. Once 6502-codegen is done from IR code,
diff --git a/examples/test.p8 b/examples/test.p8
index 1eed0a327..449a2c83b 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,56 +1,28 @@
-
 %import textio
-%import anyall
-
 %option no_sysinit
+%zeropage basicsafe
 
 main {
-    byte[256] barray
-    word[128] warray
-    uword large_barray=memory("bytes", 1000, 0)
-    uword large_warray=memory("words", 1000, 0)
-
-    sub check() {
-        txt.print_bool(anyall.all(barray, 256))
-        txt.spc()
-        txt.print_bool(anyall.any(barray, 256))
-        txt.nl()
-        txt.print_bool(anyall.allw(warray, 128))
-        txt.spc()
-        txt.print_bool(anyall.anyw(warray, 128))
-        txt.nl()
-        txt.print_bool(anyall.all(large_barray, 1000))
-        txt.spc()
-        txt.print_bool(anyall.any(large_barray, 1000))
-        txt.nl()
-        txt.print_bool(anyall.allw(large_warray, 500))
-        txt.spc()
-        txt.print_bool(anyall.anyw(large_warray, 500))
-        txt.nl()
-        txt.nl()
-    }
-
     sub start() {
-        sys.memset(large_barray, 1000, 0)
-        sys.memset(large_warray, 1000, 0)
+        ubyte @shared b1 = %10101010
+        ubyte @shared b2 = %00001111
 
-        check()
-        barray[250] = 99
-        warray[100] = $0100
-        large_barray[900] = 99
-        large_warray[900] = 99
-        check()
-        sys.memset(barray, 255, 1)
-        sys.memset(warray, 254, 1)
-        sys.memset(large_barray, 999, 1)
-        sys.memset(large_warray, 998, 1)
-        check()
-        barray[255]=1
-        warray[127]=1
-        @(large_barray+999)=1
-        @(large_warray+999)=1
-        check()
-        repeat {}
+        b1 &= ~b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+        b1 |= b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+
+        b1 = %11001100
+        b2 = %11110000
+
+        b1 &= ~b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+        b1 |= b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
 
 ;        smallringbuffer.init()
 ;