mirror of https://github.com/irmen/prog8.git
synced 2025-01-10 20:30:23 +00:00

commit eb018ae660
parent 7e5a9474fe

code optimization for bytearray[x] +/- bytearray[y]
use adc array,y or sbc array,y instead of tempvar
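
For illustration, a minimal prog8 sketch of the expression shape this commit optimizes (buffer and variable names are hypothetical, echoing the plasma example further below). The expected shape of the emitted 6502 code is shown as comments, assuming the usual p8_ symbol prefix; the exact output depends on the surrounding code:

    main {
        ubyte[] xbuf = [11, 22, 33]
        ubyte[] ybuf = [44, 55, 66]

        sub start() {
            ubyte x = 1
            ubyte y = 2
            ; before: the right operand went through the zero-page temporary
            ; P8ZP_SCRATCH_B1; now it is read directly from the array:
            ;     ldy  p8_x
            ;     lda  p8_xbuf,y   ; left operand into A
            ;     pha
            ;     ldy  p8_y        ; index of the right operand into Y
            ;     pla
            ;     clc
            ;     adc  p8_ybuf,y   ; adc array,y instead of adc P8ZP_SCRATCH_B1
            ubyte result = xbuf[x] + ybuf[y]
        }
    }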
@@ -739,15 +739,31 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                         return true
                     }
                     else -> {
-                        assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
-                        asmgen.out(" pha")
-                        assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
-                        asmgen.out(" pla")
-                        if(expr.operator=="+")
-                            asmgen.out(" clc | adc P8ZP_SCRATCH_B1")
-                        else
-                            asmgen.out(" sec | sbc P8ZP_SCRATCH_B1")
-                        assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        val rightArrayIndexer = expr.right as? PtArrayIndexer
+                        if(rightArrayIndexer!=null && rightArrayIndexer.type in ByteDatatypes && left.type in ByteDatatypes) {
+                            // special optimization for bytevalue +/- bytearr[y] : no need to use a tempvar, just use adc array,y or sbc array,y
+                            assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                            asmgen.out(" pha")
+                            asmgen.assignExpressionToRegister(rightArrayIndexer.index, RegisterOrPair.Y, false)
+                            asmgen.out(" pla")
+                            val arrayvarname = if(rightArrayIndexer.usesPointerVariable)
+                                "(${rightArrayIndexer.variable.name})"
+                            else
+                                asmgen.asmSymbolName(rightArrayIndexer.variable)
+                            if (expr.operator == "+")
+                                asmgen.out(" clc | adc $arrayvarname,y")
+                            else
+                                asmgen.out(" sec | sbc $arrayvarname,y")
+                            assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        } else {
+                            assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
+                            assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                            if (expr.operator == "+")
+                                asmgen.out(" clc | adc P8ZP_SCRATCH_B1")
+                            else
+                                asmgen.out(" sec | sbc P8ZP_SCRATCH_B1")
+                            assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        }
                         return true
                     }
                 }
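
The usesPointerVariable branch above covers an array accessed through a pointer variable rather than a named array. A hedged sketch of that case (names hypothetical; assumes prog8 pointer indexing maps to such an indexer and that the pointer is allocated in zero page so that (ptr),y indirect indexed addressing is valid):

    uword @zp ptr = $0400        ; hypothetical pointer into memory
    ubyte r = xbuf[x] + ptr[y]
    ; expected core of the emitted code, per the branch above:
    ;     pha              ; save left operand (already in A)
    ;     ldy  p8_y        ; index of the right operand into Y
    ;     pla
    ;     clc
    ;     adc  (ptr),y     ; read through the pointer, no tempvar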
@@ -1857,7 +1873,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
             return
         }

-        // No more special optmized cases yet. Do the rest via more complex evaluation
+        // No more special optimized cases yet. Do the rest via more complex evaluation
         // note: cannot use assignTypeCastedValue because that is ourselves :P
         // NOTE: THIS MAY TURN INTO A STACK OVERFLOW ERROR IF IT CAN'T SIMPLIFY THE TYPECAST..... :-/
         asmgen.assignExpressionTo(origTypeCastExpression, target)
@@ -78,6 +78,16 @@ internal class BeforeAsmAstChanger(val program: Program, private val options: Co
         val typeCast = binExpr.left as? TypecastExpression
         if(typeCast!=null && typeCast.expression isSameAs assignment.target)
             return noModifications
+
+        if(binExpr.operator in "+-") {
+            val leftDt = binExpr.left.inferType(program)
+            val rightDt = binExpr.right.inferType(program)
+            if(leftDt==rightDt && leftDt.isInteger && rightDt.isInteger && binExpr.right is ArrayIndexedExpression) {
+                // don't split array[i] +/- array[i] (the codegen has an optimized path for this)
+                return noModifications
+            }
+        }
+
         val sourceDt = binExpr.left.inferType(program).getOrElse { throw AssemblyError("unknown dt") }
         val (_, left) = binExpr.left.typecastTo(assignment.target.inferType(program).getOrElse { throw AssemblyError(
             "unknown dt"
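
A sketch of a prog8 assignment this new check keeps intact (the same pattern the test program in the last hunk below exercises); without it, the AST changer could split the array read off into a separate temporary assignment, hiding the expression from the optimized adc/sbc array,y path in AssignmentAsmGen:

    ubyte[] xx = [1, 2, 3]
    ubyte[] yy = [4, 5, 6]
    ubyte i = 2
    uword screen = $0400     ; hypothetical screen address
    ; left and right are the same integer type and the right side is an
    ; ArrayIndexedExpression, so the assignment is no longer split up:
    @(screen+i) = xx[i] + yy[i]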
@@ -1,9 +1,6 @@
 TODO
 ====

-- add special optimization for @(screen+i) = xbuf[x] + ybuf[y] and @(screen+i) = xbuf[x] - ybuf[y]
-  (noticable in plasma.p8 and cube examples?)
-
 - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm??
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
@@ -88,14 +88,10 @@ main {
         c2A += 2
         c2B -= 3

-        for y in 24 downto 0 {
-            for x in 39 downto 0 {
-                ; split the array expression to avoid a prog8 temporary var inefficiency
-                ; this pure prog8 version achieves ~17 fps
-                ubyte @zp tmp = ybuf[y]
-                @(screen+x) = xbuf[x] + tmp
-                ; prog8 at this time needs a temp variable to calculate the above expression.
-                ; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64):
+        for y in 0 to 24 {
+            for x in 0 to 39 {
+                @(screen+x) = xbuf[x] + ybuf[y]
+                ; max optimized asm is this: (achieving ~21 fps on the C64):
                 ; %asm {{
                 ;    ldy p8_y
                 ;    lda p8_ybuf,y
@@ -10,13 +10,13 @@ main {
         ubyte j = 4
         uword screen

-        ubyte result = xx[i] + yy[j] ; TODO optimize to use add addr,y
+        ubyte result = xx[i] + yy[j]
         txt.print_ub(result) ; 149
         txt.nl()
-        result = xx[i] + yy[i] ; TODO optimize to use add addr,y
+        result = xx[i] + yy[i]
         txt.print_ub(result) ; 148
         txt.nl()
-        @(screen+i) = xx[i] + yy[i] ; TODO why is this using P8ZP_SCRATCH_B1?
+        @(screen+i) = xx[i] + yy[i]

         ; ubyte index = 100
         ; ubyte[] t_index = [1,2,3,4,5]