From 78c7ee247ae6aff19fdaf745262861a28eed66f0 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Tue, 16 Jul 2024 00:25:29 +0200
Subject: [PATCH] generate 65c02 TSB/TRB instructions in certain cases

---
 .../src/prog8/codegen/cpu6502/AsmOptimizer.kt | 30 +++++++++
 .../assignment/AugmentableAssignmentAsmGen.kt | 18 +++++
 docs/source/todo.rst                          |  1 -
 examples/test.p8                              | 66 ++++++-------------
 4 files changed, 67 insertions(+), 48 deletions(-)
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
index 7aea2813c..dfb34647f 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
@@ -50,6 +50,13 @@ internal fun optimizeAssembly(lines: MutableList<String>, machine: IMachineDefin
         numberOfOptimizations++
     }
 
+    mods = optimizeTSBtoRegularOr(linesByFour)
+    if(mods.isNotEmpty()) {
+        apply(mods, lines)
+        linesByFour = getLinesBy(lines, 4)
+        numberOfOptimizations++
+    }
+
     var linesByFourteen = getLinesBy(lines, 14)
     mods = optimizeSameAssignments(linesByFourteen, machine, symbolTable)
     if(mods.isNotEmpty()) {
@@ -684,6 +691,29 @@ private fun optimizeUselessPushPopStack(linesByFour: Sequence<List<IndexedValue<
     return mods
 }
 
+
+private fun optimizeTSBtoRegularOr(linesByFour: Sequence<List<IndexedValue<String>>>): List<Modification> {
+    // Asm peephole:  lda var2 / tsb var1 / lda var1  ends with (var1 | var2) in both A and var1.
+    // Rewritten as   lda var1 / ora var2 / sta var1  which gives the same result in fewer cycles.
+    val mods = mutableListOf<Modification>()
+    for(window in linesByFour) {
+        // only the first three lines of every four-line window take part in this pattern
+        val (load, setBits, reload) = window.take(3).map { it.value.trimStart() }
+        val mnemonicsMatch = load.startsWith("lda") && setBits.startsWith("tsb") && reload.startsWith("lda")
+        if(!mnemonicsMatch)
+            continue
+        // an operand is everything after the 3-letter mnemonic (leading blanks and any trailing comment included)
+        val loaded = load.drop(3)
+        val target = setBits.drop(3)
+        if(loaded == target || target != reload.drop(3))
+            continue
+        mods += Modification(window[0].index, false, "  lda  $target  ; op2")
+        mods += Modification(window[1].index, false, "  ora  $loaded  ; op1")
+        mods += Modification(window[2].index, false, "  sta  $target  ; op2")
+    }
+    return mods
+}
+
 private fun optimizeUnneededTempvarInAdd(linesByFour: Sequence<List<IndexedValue<String>>>): List<Modification> {
     // sequence:  sta  P8ZP_SCRATCH_XX  / lda  something / clc / adc  P8ZP_SCRATCH_XX
     // this can be performed without the scratch variable:  clc  /  adc  something
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
index 1d05e5ff4..d3185475b 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
@@ -1086,6 +1086,15 @@ $shortcutLabel:""")
             }
         }
 
+        if(asmgen.isTargetCpu(CpuType.CPU65c02)) {
+            if(operator=="&" && value is PtPrefix && value.operator=="~") {
+                // M &= ~A  -->  use special TRB 65c02 instruction for that
+                asmgen.assignExpressionToRegister(value.value, RegisterOrPair.A, dt in SignedDatatypes)
+                asmgen.out("  trb  $name")
+                return
+            }
+        }
+
         // normal evaluation
         asmgen.assignExpressionToRegister(value, RegisterOrPair.A, dt in SignedDatatypes)
         inplacemodificationRegisterAwithVariableWithSwappedOperands(operator, name, dt in SignedDatatypes)
@@ -1094,6 +1103,15 @@ $shortcutLabel:""")
 
     private fun inplacemodificationByteVariableWithVariable(name: String, dt: DataType, operator: String, otherName: String) {
         // note: no logical and/or shortcut here, not worth it due to simple right operand
+
+        if(asmgen.isTargetCpu(CpuType.CPU65c02)) {
+            if(operator=="|") {
+                // M |= A  -->  use special TSB 65c02 instruction for that
+                asmgen.out("  lda  $otherName |  tsb  $name")
+                return
+            }
+        }
+
         asmgen.out("  lda  $name")
         inplacemodificationRegisterAwithVariable(operator, otherName, dt in SignedDatatypes)
         asmgen.out("  sta  $name")
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 84c1baa73..ea1e06108 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -91,7 +91,6 @@ Libraries:
 
 Optimizations:
 
-- For 65c02 targets: use trb and tsb instructions if possible (f.ex. generating  ``lda cmask   trb nvub`` for ``nvub &= ~cmask``  and ``lda cmask  and fillm   tsb nvub`` for  ``nvub |= cmask & fillm``
 - VariableAllocator: can we think of a smarter strategy for allocating variables into zeropage, rather than first-come-first-served?
   for instance, vars used inside loops first, then loopvars, then uwords used as pointers, then the rest
 - various optimizers skip stuff if compTarget.name==VMTarget.NAME.  Once 6502-codegen is done from IR code,
diff --git a/examples/test.p8 b/examples/test.p8
index 1eed0a327..449a2c83b 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,56 +1,28 @@
-
 %import textio
-%import anyall
-
 %option no_sysinit
+%zeropage basicsafe
 
 main {
-    byte[256] barray
-    word[128] warray
-    uword large_barray=memory("bytes", 1000, 0)
-    uword large_warray=memory("words", 1000, 0)
-
-    sub check() {
-        txt.print_bool(anyall.all(barray, 256))
-        txt.spc()
-        txt.print_bool(anyall.any(barray, 256))
-        txt.nl()
-        txt.print_bool(anyall.allw(warray, 128))
-        txt.spc()
-        txt.print_bool(anyall.anyw(warray, 128))
-        txt.nl()
-        txt.print_bool(anyall.all(large_barray, 1000))
-        txt.spc()
-        txt.print_bool(anyall.any(large_barray, 1000))
-        txt.nl()
-        txt.print_bool(anyall.allw(large_warray, 500))
-        txt.spc()
-        txt.print_bool(anyall.anyw(large_warray, 500))
-        txt.nl()
-        txt.nl()
-    }
-
     sub start() {
-        sys.memset(large_barray, 1000, 0)
-        sys.memset(large_warray, 1000, 0)
+        ubyte @shared b1 = %10101010
+        ubyte @shared b2 = %00001111
 
-        check()
-        barray[250] = 99
-        warray[100] = $0100
-        large_barray[900] = 99
-        large_warray[900] = 99
-        check()
-        sys.memset(barray, 255, 1)
-        sys.memset(warray, 254, 1)
-        sys.memset(large_barray, 999, 1)
-        sys.memset(large_warray, 998, 1)
-        check()
-        barray[255]=1
-        warray[127]=1
-        @(large_barray+999)=1
-        @(large_warray+999)=1
-        check()
-        repeat {}
+        b1 &= ~b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+        b1 |= b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+
+        b1 = %11001100
+        b2 = %11110000
+
+        b1 &= ~b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
+        b1 |= b2
+        txt.print_ubbin(b1, true)
+        txt.nl()
 
 ;        smallringbuffer.init()
 ;