From eb018ae6601ac890a54888d0e9dbff4403dc5248 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Mon, 4 Sep 2023 21:07:49 +0200
Subject: [PATCH] code optimization for  bytearray[x] +/- bytearray[y]

Use adc array,y or sbc array,y directly instead of going through a temporary variable.
---
 .../cpu6502/assignment/AssignmentAsmGen.kt    | 36 +++++++++++++------
 .../astprocessing/BeforeAsmAstChanger.kt      | 10 ++++++
 docs/source/todo.rst                          |  3 --
 examples/c64/plasma.p8                        | 12 +++----
 examples/test.p8                              |  6 ++--
 5 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
index 052f01fb5..37fd86f25 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
@@ -739,15 +739,31 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                     return true
                 }
                 else -> {
-                    assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
-                    asmgen.out("  pha")
-                    assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
-                    asmgen.out("  pla")
-                    if(expr.operator=="+")
-                        asmgen.out("  clc |  adc  P8ZP_SCRATCH_B1")
-                    else
-                        asmgen.out("  sec |  sbc  P8ZP_SCRATCH_B1")
-                    assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                    val rightArrayIndexer = expr.right as? PtArrayIndexer
+                    if(rightArrayIndexer!=null && rightArrayIndexer.type in ByteDatatypes && left.type in ByteDatatypes) {
+                        // special optimization for  bytevalue +/- bytearr[y] :  no need to use a tempvar, just use adc array,y or sbc array,y
+                        assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                        asmgen.out("  pha")
+                        asmgen.assignExpressionToRegister(rightArrayIndexer.index, RegisterOrPair.Y, false)
+                        asmgen.out("  pla")
+                        val arrayvarname = if(rightArrayIndexer.usesPointerVariable)
+                                "(${rightArrayIndexer.variable.name})"
+                            else
+                                asmgen.asmSymbolName(rightArrayIndexer.variable)
+                        if (expr.operator == "+")
+                            asmgen.out("  clc |  adc  $arrayvarname,y")
+                        else
+                            asmgen.out("  sec |  sbc  $arrayvarname,y")
+                        assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                    } else {
+                        assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
+                        assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                        if (expr.operator == "+")
+                            asmgen.out("  clc |  adc  P8ZP_SCRATCH_B1")
+                        else
+                            asmgen.out("  sec |  sbc  P8ZP_SCRATCH_B1")
+                        assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                    }
                     return true
                 }
             }
@@ -1857,7 +1873,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
             return
         }
 
-        // No more special optmized cases yet. Do the rest via more complex evaluation
+        // No more special optimized cases yet. Do the rest via more complex evaluation
         // note: cannot use assignTypeCastedValue because that is ourselves :P
         // NOTE: THIS MAY TURN INTO A STACK OVERFLOW ERROR IF IT CAN'T SIMPLIFY THE TYPECAST..... :-/
         asmgen.assignExpressionTo(origTypeCastExpression, target)
diff --git a/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt b/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt
index 7288c9e93..7e8687c3c 100644
--- a/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt
+++ b/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt
@@ -78,6 +78,16 @@ internal class BeforeAsmAstChanger(val program: Program, private val options: Co
                         val typeCast = binExpr.left as? TypecastExpression
                         if(typeCast!=null && typeCast.expression isSameAs assignment.target)
                             return noModifications
+
+                        if(binExpr.operator in "+-") {
+                            val leftDt = binExpr.left.inferType(program)
+                            val rightDt = binExpr.right.inferType(program)
+                            if(leftDt==rightDt && leftDt.isInteger && rightDt.isInteger && binExpr.right is ArrayIndexedExpression) {
+                                // don't split  <expr> +/- array[i]  when both operands have the same integer type (the 6502 codegen has an optimized adc/sbc array,y path for the byte case)
+                                return noModifications
+                            }
+                        }
+
                         val sourceDt = binExpr.left.inferType(program).getOrElse { throw AssemblyError("unknown dt") }
                         val (_, left) = binExpr.left.typecastTo(assignment.target.inferType(program).getOrElse { throw AssemblyError(
                             "unknown dt"
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 73c8f5df1..94458fa1a 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,9 +1,6 @@
 TODO
 ====
 
-- add special optimization for  @(screen+i) = xbuf[x] + ybuf[y]  and   @(screen+i) = xbuf[x] - ybuf[y]
-  (noticable in plasma.p8 and cube examples?)
-
 - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm??
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
diff --git a/examples/c64/plasma.p8 b/examples/c64/plasma.p8
index 97104b921..93af3f669 100644
--- a/examples/c64/plasma.p8
+++ b/examples/c64/plasma.p8
@@ -88,14 +88,10 @@ main {
         c2A += 2
         c2B -= 3
 
-        for y in 24 downto 0 {
-            for x in 39 downto 0 {
-                ; split the array expression to avoid a prog8 temporary var inefficiency
-                ; this pure prog8 version achieves ~17 fps
-                ubyte @zp tmp = ybuf[y]
-                @(screen+x) = xbuf[x] + tmp
-; prog8 at this time needs a temp variable to calculate the above expression.
-; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64):
+        for y in 0 to 24 {
+            for x in 0 to 39 {
+                @(screen+x) = xbuf[x] + ybuf[y]
+; the most optimized asm for this line is the following (achieving ~21 fps on the C64):
 ;                %asm {{
 ;                     ldy  p8_y
 ;                     lda  p8_ybuf,y
diff --git a/examples/test.p8 b/examples/test.p8
index ec9c61493..5b198b599 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -10,13 +10,13 @@ main {
         ubyte j = 4
         uword screen
 
-        ubyte result = xx[i] + yy[j]        ; TODO optimize to use add addr,y
+        ubyte result = xx[i] + yy[j]
         txt.print_ub(result)    ; 149
         txt.nl()
-        result = xx[i] + yy[i]              ; TODO optimize to use add addr,y
+        result = xx[i] + yy[i]
         txt.print_ub(result)    ; 148
         txt.nl()
-        @(screen+i) = xx[i] + yy[i]     ; TODO why is this using P8ZP_SCRATCH_B1?
+        @(screen+i) = xx[i] + yy[i]
 
 ;        ubyte index = 100
 ;        ubyte[] t_index = [1,2,3,4,5]