diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 94458fa1a..73c8f5df1 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -1,6 +1,9 @@ TODO ==== +- add special optimization for @(screen+i) = xbuf[x] + ybuf[y] and @(screen+i) = xbuf[x] - ybuf[y] + (noticeable in plasma.p8 and cube examples?) + - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm?? - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction diff --git a/examples/c64/plasma.p8 b/examples/c64/plasma.p8 index 937fd69dc..97104b921 100644 --- a/examples/c64/plasma.p8 +++ b/examples/c64/plasma.p8 @@ -1,5 +1,4 @@ %import syslib -%import test_stack %import textio %import math @@ -7,10 +6,11 @@ ; which is (w)2001 by groepaz/hitmen ; ; Cleanup and porting to C by Ullrich von Bassewitz. +; See https://github.com/cc65/cc65/tree/master/samples/cbm/plasma.c +; ; Converted to prog8 by Irmen de Jong. 
- main { const uword SCREEN1 = $E000 const uword SCREEN2 = $E400 @@ -22,27 +22,39 @@ main { sub start() { txt.color(1) txt.clear_screen() - txt.print("creating charset...\n") + txt.print("creating charset...\n\nwhile running, press key to stop.\n\n") makechar() ubyte block = c64.CIA2PRA - ; ubyte v = cbm.VMCSB + ubyte v = c64.VMCSB c64.CIA2PRA = (block & $FC) | (lsb(SCREEN1 >> 14) ^ $03) - repeat { + uword frames = 0 + cbm.SETTIM(0,0,0) + + while cbm.GETIN()==0 { doplasma(SCREEN1) c64.VMCSB = PAGE1 doplasma(SCREEN2) c64.VMCSB = PAGE2 + frames += 2 } - ; restore screen (if you want) - ;c64.VMCSB = v - ;c64.CIA2PRA = block - ;txt.print("done!\n") - ;test_stack.test() - ;repeat { - ;} + uword jiffies = cbm.RDTIM16() + + ; restore screen and display stats + c64.VMCSB = v + c64.CIA2PRA = block + txt.print("time in jiffies: ") + txt.print_uw(jiffies) + txt.print("\nframes: ") + txt.print_uw(frames) + uword fps = (frames*60)/jiffies + txt.print("\nfps: ") + txt.print_uw(fps) + txt.print("\ndone!\n") + repeat { + } } ; several variables outside of doplasma to make them retain their value @@ -51,7 +63,7 @@ main { ubyte c2A ubyte c2B - sub doplasma(uword screen) { + sub doplasma(uword @zp screen) { ubyte[40] xbuf ubyte[25] ybuf ubyte c1a = c1A @@ -78,17 +90,19 @@ main { for y in 24 downto 0 { for x in 39 downto 0 { - ; using a temp var here to enable expression optimization that can't be done on a 'problematic' ROM/RAM memory location - ubyte @zp cc = xbuf[x] + ybuf[y] - @(screen+x) = cc -; this is the fastest way to do this inner part: + ; split the array expression to avoid a prog8 temporary var inefficiency + ; this pure prog8 version achieves ~17 fps + ubyte @zp tmp = ybuf[y] + @(screen+x) = xbuf[x] + tmp +; prog8 at this time needs a temp variable to calculate the above expression. 
+; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64): ; %asm {{ -; ldy y -; lda ybuf,y -; ldy x +; ldy p8_y +; lda p8_ybuf,y +; ldy p8_x ; clc -; adc xbuf,y -; sta (screen),y +; adc p8_xbuf,y +; sta (p8_screen),y ; }} } screen += 40 @@ -99,16 +113,15 @@ main { ubyte[8] bittab = [ $01, $02, $04, $08, $10, $20, $40, $80 ] ubyte c for c in 0 to 255 { - ubyte @zp s = math.sin8u(c) + ubyte @zp s = math.sin8u(c) ; chance ubyte i + ; for all the pixels in the 8x8 character grid, determine (with a rnd chance) if they should be on or off for i in 0 to 7 { ubyte b=0 ubyte @zp ii for ii in 0 to 7 { - ; use 16 bit rng for a bit more randomness instead of the 8-bit rng - if math.rnd() > s { + if math.rnd() > s b |= bittab[ii] - } } @(CHARSET + i + c*$0008) = b } diff --git a/examples/test.p8 b/examples/test.p8 index cbf4d876f..ec9c61493 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -4,10 +4,19 @@ main { sub start() { - byte bb = 20 - word ww= 300 - ww += bb*3 - txt.print_w(ww) ; 240 + ubyte[5] xx = [11,22,33,44,55] + ubyte[5] yy = [101,102,103,104,105] + ubyte i=3 + ubyte j = 4 + uword screen + + ubyte result = xx[i] + yy[j] ; TODO optimize to use add addr,y + txt.print_ub(result) ; 149 + txt.nl() + result = xx[i] + yy[i] ; TODO optimize to use add addr,y + txt.print_ub(result) ; 148 + txt.nl() + @(screen+i) = xx[i] + yy[i] ; TODO why is this using P8ZP_SCRATCH_B1? ; ubyte index = 100 ; ubyte[] t_index = [1,2,3,4,5]