mirror of https://github.com/irmen/prog8.git
synced 2025-01-10 20:30:23 +00:00

commit eb018ae660
parent 7e5a9474fe

code optimization for bytearray[x] +/- bytearray[y]
use adc array,y or sbc array,y instead of tempvar
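
For illustration, a minimal prog8 sketch of the expression shape this commit optimizes (buffer and variable names are hypothetical, echoing the plasma example further below). The expected shape of the emitted 6502 code is shown as comments, assuming the usual p8_ symbol prefix; the exact output depends on the surrounding code:

    main {
        ubyte[] xbuf = [11, 22, 33]
        ubyte[] ybuf = [44, 55, 66]

        sub start() {
            ubyte x = 1
            ubyte y = 2
            ; before: the right operand went through the zero-page temporary
            ; P8ZP_SCRATCH_B1; now it is read directly from the array:
            ;     ldy  p8_x
            ;     lda  p8_xbuf,y   ; left operand into A
            ;     pha
            ;     ldy  p8_y        ; index of the right operand into Y
            ;     pla
            ;     clc
            ;     adc  p8_ybuf,y   ; adc array,y instead of adc P8ZP_SCRATCH_B1
            ubyte result = xbuf[x] + ybuf[y]
        }
    }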
@@ -739,15 +739,31 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                         return true
                     }
                     else -> {
-                        assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
-                        asmgen.out(" pha")
-                        assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
-                        asmgen.out(" pla")
-                        if(expr.operator=="+")
-                            asmgen.out(" clc | adc P8ZP_SCRATCH_B1")
-                        else
-                            asmgen.out(" sec | sbc P8ZP_SCRATCH_B1")
-                        assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        val rightArrayIndexer = expr.right as? PtArrayIndexer
+                        if(rightArrayIndexer!=null && rightArrayIndexer.type in ByteDatatypes && left.type in ByteDatatypes) {
+                            // special optimization for bytevalue +/- bytearr[y] : no need to use a tempvar, just use adc array,y or sbc array,y
+                            assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                            asmgen.out(" pha")
+                            asmgen.assignExpressionToRegister(rightArrayIndexer.index, RegisterOrPair.Y, false)
+                            asmgen.out(" pla")
+                            val arrayvarname = if(rightArrayIndexer.usesPointerVariable)
+                                "(${rightArrayIndexer.variable.name})"
+                            else
+                                asmgen.asmSymbolName(rightArrayIndexer.variable)
+                            if (expr.operator == "+")
+                                asmgen.out(" clc | adc $arrayvarname,y")
+                            else
+                                asmgen.out(" sec | sbc $arrayvarname,y")
+                            assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        } else {
+                            assignExpressionToVariable(right, "P8ZP_SCRATCH_B1", right.type)
+                            assignExpressionToRegister(left, RegisterOrPair.A, left.type==DataType.BYTE)
+                            if (expr.operator == "+")
+                                asmgen.out(" clc | adc P8ZP_SCRATCH_B1")
+                            else
+                                asmgen.out(" sec | sbc P8ZP_SCRATCH_B1")
+                            assignRegisterByte(target, CpuRegister.A, dt in SignedDatatypes)
+                        }
                         return true
                     }
                 }
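
The usesPointerVariable branch above covers an array accessed through a pointer variable rather than a named array. A hedged sketch of that case (names hypothetical; assumes prog8 pointer indexing maps to such an indexer and that the pointer is allocated in zero page so that (ptr),y indirect indexed addressing is valid):

    uword @zp ptr = $0400        ; hypothetical pointer into memory
    ubyte r = xbuf[x] + ptr[y]
    ; expected core of the emitted code, per the branch above:
    ;     pha              ; save left operand (already in A)
    ;     ldy  p8_y        ; index of the right operand into Y
    ;     pla
    ;     clc
    ;     adc  (ptr),y     ; read through the pointer, no tempvar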
@@ -1857,7 +1873,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
             return
         }

-        // No more special optmized cases yet. Do the rest via more complex evaluation
+        // No more special optimized cases yet. Do the rest via more complex evaluation
         // note: cannot use assignTypeCastedValue because that is ourselves :P
         // NOTE: THIS MAY TURN INTO A STACK OVERFLOW ERROR IF IT CAN'T SIMPLIFY THE TYPECAST..... :-/
         asmgen.assignExpressionTo(origTypeCastExpression, target)
@@ -78,6 +78,16 @@ internal class BeforeAsmAstChanger(val program: Program, private val options: Co
         val typeCast = binExpr.left as? TypecastExpression
         if(typeCast!=null && typeCast.expression isSameAs assignment.target)
             return noModifications
+
+        if(binExpr.operator in "+-") {
+            val leftDt = binExpr.left.inferType(program)
+            val rightDt = binExpr.right.inferType(program)
+            if(leftDt==rightDt && leftDt.isInteger && rightDt.isInteger && binExpr.right is ArrayIndexedExpression) {
+                // don't split array[i] +/- array[i] (the codegen has an optimized path for this)
+                return noModifications
+            }
+        }
+
         val sourceDt = binExpr.left.inferType(program).getOrElse { throw AssemblyError("unknown dt") }
         val (_, left) = binExpr.left.typecastTo(assignment.target.inferType(program).getOrElse { throw AssemblyError(
             "unknown dt"
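
A sketch of a prog8 assignment this new check keeps intact (the same pattern the test program in the last hunk below exercises); without it, the AST changer could split the array read off into a separate temporary assignment, hiding the expression from the optimized adc/sbc array,y path in AssignmentAsmGen:

    ubyte[] xx = [1, 2, 3]
    ubyte[] yy = [4, 5, 6]
    ubyte i = 2
    uword screen = $0400     ; hypothetical screen address
    ; left and right are the same integer type and the right side is an
    ; ArrayIndexedExpression, so the assignment is no longer split up:
    @(screen+i) = xx[i] + yy[i]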
@@ -1,9 +1,6 @@
 TODO
 ====

-- add special optimization for @(screen+i) = xbuf[x] + ybuf[y] and @(screen+i) = xbuf[x] - ybuf[y]
-  (noticable in plasma.p8 and cube examples?)
-
 - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm??
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
@@ -88,14 +88,10 @@ main {
         c2A += 2
         c2B -= 3

-        for y in 24 downto 0 {
-            for x in 39 downto 0 {
-                ; split the array expression to avoid a prog8 temporary var inefficiency
-                ; this pure prog8 version achieves ~17 fps
-                ubyte @zp tmp = ybuf[y]
-                @(screen+x) = xbuf[x] + tmp
-                ; prog8 at this time needs a temp variable to calculate the above expression.
-                ; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64):
+        for y in 0 to 24 {
+            for x in 0 to 39 {
+                @(screen+x) = xbuf[x] + ybuf[y]
+                ; max optimized asm is this: (achieving ~21 fps on the C64):
                 ; %asm {{
                 ;    ldy p8_y
                 ;    lda p8_ybuf,y
@@ -10,13 +10,13 @@ main {
         ubyte j = 4
         uword screen

-        ubyte result = xx[i] + yy[j] ; TODO optimize to use add addr,y
+        ubyte result = xx[i] + yy[j]
         txt.print_ub(result) ; 149
         txt.nl()
-        result = xx[i] + yy[i] ; TODO optimize to use add addr,y
+        result = xx[i] + yy[i]
         txt.print_ub(result) ; 148
         txt.nl()
-        @(screen+i) = xx[i] + yy[i] ; TODO why is this using P8ZP_SCRATCH_B1?
+        @(screen+i) = xx[i] + yy[i]

         ; ubyte index = 100
         ; ubyte[] t_index = [1,2,3,4,5]