From eb018ae6601ac890a54888d0e9dbff4403dc5248 Mon Sep 17 00:00:00 2001 From: Irmen de Jong Date: Mon, 4 Sep 2023 21:07:49 +0200 Subject: [PATCH] code optimization for bytearray[x] +/- bytearray[y] use adc array,y or sbc array,y instead of tempvar --- .../cpu6502/assignment/AssignmentAsmGen.kt | 36 +++++++++++++------ .../astprocessing/BeforeAsmAstChanger.kt | 10 ++++++ docs/source/todo.rst | 3 -- examples/c64/plasma.p8 | 12 +++---- examples/test.p8 | 6 ++-- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt index 052f01fb5..37fd86f25 100644 --- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt +++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt @@ -739,15 +739,31 @@ internal class AssignmentAsmGen(private val program: PtProgram, return true } else -> { - assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE) - asmgen.out(" pha") - assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type) - asmgen.out(" pla") - if(expr.operator=="+") - asmgen.out(" clc | adc P8ZP_SCRATCH_B1") - else - asmgen.out(" sec | sbc P8ZP_SCRATCH_B1") - assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes) + val rightArrayIndexer = expr.right as? 
PtArrayIndexer + if(rightArrayIndexer!=null && rightArrayIndexer.type in ByteDatatypes && left.type in ByteDatatypes) { + // special optimization for bytevalue +/- bytearr[y] : no need to use a tempvar, just use adc array,y or sbc array,y + assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE) + asmgen.out(" pha") + asmgen.assignExpressionToRegister(rightArrayIndexer.index, RegisterOrPair.Y, false) + asmgen.out(" pla") + val arrayvarname = if(rightArrayIndexer.usesPointerVariable) + "(${rightArrayIndexer.variable.name})" + else + asmgen.asmSymbolName(rightArrayIndexer.variable) + if (expr.operator == "+") + asmgen.out(" clc | adc $arrayvarname,y") + else + asmgen.out(" sec | sbc $arrayvarname,y") + assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes) + } else { + assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type) + assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE) + if (expr.operator == "+") + asmgen.out(" clc | adc P8ZP_SCRATCH_B1") + else + asmgen.out(" sec | sbc P8ZP_SCRATCH_B1") + assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes) + } return true } } @@ -1857,7 +1873,7 @@ internal class AssignmentAsmGen(private val program: PtProgram, return } - // No more special optmized cases yet. Do the rest via more complex evaluation + // No more special optimized cases yet. Do the rest via more complex evaluation // note: cannot use assignTypeCastedValue because that is ourselves :P // NOTE: THIS MAY TURN INTO A STACK OVERFLOW ERROR IF IT CAN'T SIMPLIFY THE TYPECAST..... 
:-/ asmgen.assignExpressionTo(origTypeCastExpression, target) diff --git a/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt b/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt index 7288c9e93..7e8687c3c 100644 --- a/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt +++ b/compiler/src/prog8/compiler/astprocessing/BeforeAsmAstChanger.kt @@ -78,6 +78,16 @@ internal class BeforeAsmAstChanger(val program: Program, private val options: Co val typeCast = binExpr.left as? TypecastExpression if(typeCast!=null && typeCast.expression isSameAs assignment.target) return noModifications + + if(binExpr.operator in "+-") { + val leftDt = binExpr.left.inferType(program) + val rightDt = binExpr.right.inferType(program) + if(leftDt==rightDt && leftDt.isInteger && rightDt.isInteger && binExpr.right is ArrayIndexedExpression) { + // don't split value +/- array[i] when both operands have the same integer type (the codegen has an optimized path for the byte case) + return noModifications + } + } + val sourceDt = binExpr.left.inferType(program).getOrElse { throw AssemblyError("unknown dt") } val (_, left) = binExpr.left.typecastTo(assignment.target.inferType(program).getOrElse { throw AssemblyError( "unknown dt" diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 73c8f5df1..94458fa1a 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -1,9 +1,6 @@ TODO ==== -- add special optimization for @(screen+i) = xbuf[x] + ybuf[y] and @(screen+i) = xbuf[x] - ybuf[y] - (noticable in plasma.p8 and cube examples?) - - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm?? - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... 
- IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction diff --git a/examples/c64/plasma.p8 b/examples/c64/plasma.p8 index 97104b921..93af3f669 100644 --- a/examples/c64/plasma.p8 +++ b/examples/c64/plasma.p8 @@ -88,14 +88,10 @@ main { c2A += 2 c2B -= 3 - for y in 24 downto 0 { - for x in 39 downto 0 { - ; split the array expression to avoid a prog8 temporary var inefficiency - ; this pure prog8 version achieves ~17 fps - ubyte @zp tmp = ybuf[y] - @(screen+x) = xbuf[x] + tmp -; prog8 at this time needs a temp variable to calculate the above expression. -; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64): + for y in 0 to 24 { + for x in 0 to 39 { + @(screen+x) = xbuf[x] + ybuf[y] +; max optimized asm is this: (achieving ~21 fps on the C64): ; %asm {{ ; ldy p8_y ; lda p8_ybuf,y diff --git a/examples/test.p8 b/examples/test.p8 index ec9c61493..5b198b599 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -10,13 +10,13 @@ main { ubyte j = 4 uword screen - ubyte result = xx[i] + yy[j] ; TODO optimize to use add addr,y + ubyte result = xx[i] + yy[j] txt.print_ub(result) ; 149 txt.nl() - result = xx[i] + yy[i] ; TODO optimize to use add addr,y + result = xx[i] + yy[i] txt.print_ub(result) ; 148 txt.nl() - @(screen+i) = xx[i] + yy[i] ; TODO why is this using P8ZP_SCRATCH_B1? + @(screen+i) = xx[i] + yy[i] ; ubyte index = 100 ; ubyte[] t_index = [1,2,3,4,5]