diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
index f499f8dea..487ae797a 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/AsmOptimizer.kt
@@ -384,6 +384,7 @@ private fun optimizeStoreLoadSame(
     for (lines in linesByFour) {
         val first = lines[1].value.trimStart()
         val second = lines[2].value.trimStart()
+        val third = lines[3].value.trimStart()
 
         // sta X + lda X,  sty X + ldy X,   stx X + ldx X  -> the second instruction can OFTEN be eliminated
         if ((first.startsWith("sta ") && second.startsWith("lda ")) ||
@@ -393,7 +394,6 @@ private fun optimizeStoreLoadSame(
                 (first.startsWith("ldy ") && second.startsWith("ldy ")) ||
                 (first.startsWith("ldx ") && second.startsWith("ldx "))
         ) {
-            val third = lines[3].value.trimStart()
             val attemptRemove =
                 if(third.isBranch()) {
                     // a branch instruction follows, we can only remove the load instruction if
@@ -446,6 +446,23 @@ private fun optimizeStoreLoadSame(
             if (firstLoc == secondLoc)
                 mods.add(Modification(lines[2].index, true, null))
         }
+
+        // phy + ldy + pla -> tya + ldy
+        // phx + ldx + pla -> txa + ldx
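+        // pha + lda + pla -> all three removed, A is unchanged (NOTE(review): N/Z flags from pla are lost; confirm nothing relies on them)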
+        if(first=="phy" && second.startsWith("ldy ") && third=="pla") {
+            mods.add(Modification(lines[3].index, true, null))
+            mods.add(Modification(lines[1].index, false, "  tya"))
+        }
+        else if(first=="phx" && second.startsWith("ldx ") && third=="pla") {
+            mods.add(Modification(lines[3].index, true, null))
+            mods.add(Modification(lines[1].index, false, "  txa"))
+        }
+        else if(first=="pha" && second.startsWith("lda ") && third=="pla") {
+            mods.add(Modification(lines[1].index, true, null))
+            mods.add(Modification(lines[2].index, true, null))
+            mods.add(Modification(lines[3].index, true, null))
+        }
     }
     return mods
 }
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
index 012aec2dc..d8fdae92b 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
@@ -3182,11 +3182,20 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                     }
                     else {
                         if (regs !in Cx16VirtualRegisters) {
-                            when (regs) {
-                                RegisterOrPair.AX -> asmgen.out("  pha |  txa |  pha")
-                                RegisterOrPair.AY -> asmgen.out("  pha |  tya |  pha")
-                                RegisterOrPair.XY -> asmgen.out("  txa |  pha |  tya |  pha")
-                                else -> throw AssemblyError("expected reg pair")
+                            if (asmgen.isTargetCpu(CpuType.CPU65c02)) {
+                                when (regs) {
+                                    RegisterOrPair.AX -> asmgen.out("  pha |  phx")
+                                    RegisterOrPair.AY -> asmgen.out("  pha |  phy")
+                                    RegisterOrPair.XY -> asmgen.out("  phx |  phy")
+                                    else -> throw AssemblyError("expected reg pair")
+                                }
+                            } else {
+                                when (regs) {
+                                    RegisterOrPair.AX -> asmgen.out("  pha |  txa |  pha")
+                                    RegisterOrPair.AY -> asmgen.out("  pha |  tya |  pha")
+                                    RegisterOrPair.XY -> asmgen.out("  txa |  pha |  tya |  pha")
+                                    else -> throw AssemblyError("expected reg pair")
+                                }
                             }
                             asmgen.loadScaledArrayIndexIntoRegister(target.array, CpuRegister.Y)
                             asmgen.out("""
@@ -3225,11 +3234,20 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                     }
                     else {
                         if (regs !in Cx16VirtualRegisters) {
-                            when (regs) {
-                                RegisterOrPair.AX -> asmgen.out("  pha |  txa |  pha")
-                                RegisterOrPair.AY -> asmgen.out("  pha |  tya |  pha")
-                                RegisterOrPair.XY -> asmgen.out("  txa |  pha |  tya |  pha")
-                                else -> throw AssemblyError("expected reg pair")
+                            if (asmgen.isTargetCpu(CpuType.CPU65c02)) {
+                                when (regs) {
+                                    RegisterOrPair.AX -> asmgen.out("  pha |  phx")
+                                    RegisterOrPair.AY -> asmgen.out("  pha |  phy")
+                                    RegisterOrPair.XY -> asmgen.out("  phx |  phy")
+                                    else -> throw AssemblyError("expected reg pair")
+                                }
+                            } else {
+                                when (regs) {
+                                    RegisterOrPair.AX -> asmgen.out("  pha |  txa |  pha")
+                                    RegisterOrPair.AY -> asmgen.out("  pha |  tya |  pha")
+                                    RegisterOrPair.XY -> asmgen.out("  txa |  pha |  tya |  pha")
+                                    else -> throw AssemblyError("expected reg pair")
+                                }
                             }
                             asmgen.loadScaledArrayIndexIntoRegister(target.array, CpuRegister.Y)
                             asmgen.out("""
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 528257362..69f5e5e25 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -6,8 +6,6 @@ causes compiler error for virtual: just calling txt.cls() gives compile error un
 
 https://github.com/irmen/prog8/issues/136 (string.find register order issue)
 
-optimization: for 65c02 sometimes tya pha is generated, could be just phy (mind if A gets used afterwards though!) (same for pla tay etcetera?)
-
 if-optimization:
         if row == NUMQUEENS {
             print_solution()
diff --git a/examples/test.p8 b/examples/test.p8
index 39d622b6b..af9b00870 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,133 +1,52 @@
-%import math
 %import textio
 %zeropage basicsafe
+%option no_sysinit
 
 main {
     sub start() {
-        cx16.r0sL = 127
-        cx16.r0sL = bytefunc(cx16.r0sL+1)
-        cx16.r0sL = 0
-        cx16.r0sL = bytefunc(cx16.r0sL-1)
-        cx16.r0sL = 55
-        cx16.r0sL = bytefunc(cx16.r0sL+20)
-        cx16.r0sL = 55
-        cx16.r0sL = bytefunc(cx16.r0sL-20)
-
-        cx16.r0s = $99ff as word
-        cx16.r0s = wordfunc(cx16.r0s+1)
-        cx16.r0s = $9900 as word
-        cx16.r0s = wordfunc(cx16.r0s-1)
-        cx16.r0s = -12345
-        cx16.r0s = wordfunc(cx16.r0s+100)
-        cx16.r0s = -12345
-        cx16.r0s = wordfunc(cx16.r0s-100)
+        signed()
+        unsigned()
     }
 
+    sub signed() {
+        byte @shared bvalue = -100
+        word @shared wvalue = -20000
 
-    sub bytefunc(byte x) -> byte {
-        txt.print_ubhex(x as ubyte, true)
-        txt.spc()
-        txt.print_b(x)
+        bvalue /= 2     ; TODO should be a simple bit shift?
+        wvalue /= 2     ; TODO should be a simple bit shift?
+
+        txt.print_b(bvalue)
+        txt.nl()
+        txt.print_w(wvalue)
+        txt.nl()
+
+        bvalue *= 2
+        wvalue *= 2
+
+        txt.print_b(bvalue)
+        txt.nl()
+        txt.print_w(wvalue)
         txt.nl()
-        return x
     }
 
-    sub wordfunc(word x) -> word {
-        txt.print_uwhex(x as uword, true)
-        txt.spc()
-        txt.print_w(x)
+    sub unsigned() {
+        ubyte @shared ubvalue = 100
+        uword @shared uwvalue = 20000
+
+        ubvalue /= 2
+        uwvalue /= 2
+
+        txt.print_ub(ubvalue)
+        txt.nl()
+        txt.print_uw(uwvalue)
+        txt.nl()
+
+        ubvalue *= 2
+        uwvalue *= 2
+
+        txt.print_ub(ubvalue)
+        txt.nl()
+        txt.print_uw(uwvalue)
         txt.nl()
-        return x
     }
 }
-
-;%import math
-;%import sprites
-;
-;main {
-;    word[128] @split xpos_orig
-;    word[128] @split ypos_orig
-;    word[128] xpos
-;    word[128] ypos
-;    ubyte[128] tt
-;
-;    sub start() {
-;        cx16.mouse_config2(1)
-;        sprites.set_mousepointer_hand()
-;        ubyte sprdat_bank
-;        uword sprdat_addr
-;        sprdat_bank, sprdat_addr = sprites.get_data_ptr(0)
-;
-;        ubyte sprite
-;        for sprite in 0 to 127 {
-;            sprites.init(sprite, sprdat_bank, sprdat_addr, sprites.SIZE_16, sprites.SIZE_16, sprites.COLORS_256, 0)
-;            xpos_orig[sprite] = sprite*$0003 +100 as word
-;            ypos_orig[sprite] = sprite*$0002 +100 as word
-;            tt[sprite] = math.rnd()
-;        }
-;
-;        repeat {
-;            sys.waitvsync()
-;            sprites.pos_batch(0, 128, &xpos, &ypos)
-;            for sprite in 0 to 127 {
-;                tt[sprite]++
-;                xpos[sprite] = xpos_orig[sprite] + math.sin8(tt[sprite])
-;                ypos[sprite] = ypos_orig[sprite] + math.cos8(tt[sprite])
-;            }
-;        }
-;    }
-;}
-;
-;
-;;%import textio
-;;%zeropage basicsafe
-;;%option no_sysinit
-;;
-;;main {
-;;    sub start() {
-;;        signed()
-;;        unsigned()
-;;    }
-;;
-;;    sub signed() {
-;;        byte @shared bvalue = -100
-;;        word @shared wvalue = -20000
-;;
-;;        bvalue /= 2     ; TODO should be a simple bit shift?
-;;        wvalue /= 2     ; TODO should be a simple bit shift?
-;;
-;;        txt.print_b(bvalue)
-;;        txt.nl()
-;;        txt.print_w(wvalue)
-;;        txt.nl()
-;;
-;;        bvalue *= 2
-;;        wvalue *= 2
-;;
-;;        txt.print_b(bvalue)
-;;        txt.nl()
-;;        txt.print_w(wvalue)
-;;        txt.nl()
-;;    }
-;;
-;;    sub unsigned() {
-;;        ubyte @shared ubvalue = 100
-;;        uword @shared uwvalue = 20000
-;;
-;;        ubvalue /= 2
-;;        uwvalue /= 2
-;;
-;;        txt.print_ub(ubvalue)
-;;        txt.nl()
-;;        txt.print_uw(uwvalue)
-;;        txt.nl()
-;;
-;;        ubvalue *= 2
-;;        uwvalue *= 2
-;;
-;;        txt.print_ub(ubvalue)
-;;        txt.nl()
-;;        txt.print_uw(uwvalue)
-;;        txt.nl()
-;;    }
-;;}