improve plasma example

2024-12-24 01:29:28 +00:00 · 2023-09-04 20:19:04 +02:00 · 2023-09-04 20:19:04 +02:00 · 7e5a9474fe
commit 7e5a9474fe
parent 525a9b5036
3 changed files with 55 additions and 30 deletions
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@ -1,6 +1,9 @@
 TODO
 ====

+- add special optimization for  @(screen+i) = xbuf[x] + ybuf[y]  and   @(screen+i) = xbuf[x] - ybuf[y]
+  (noticeable in plasma.p8 and cube examples?)
+
 - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm??
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
--- a/examples/c64/plasma.p8
+++ b/examples/c64/plasma.p8
@ -1,5 +1,4 @@
 %import syslib
-%import test_stack
 %import textio
 %import math

@ -7,10 +6,11 @@
 ;  which is (w)2001 by groepaz/hitmen
 ;
 ;  Cleanup and porting to C by Ullrich von Bassewitz.
+;  See https://github.com/cc65/cc65/tree/master/samples/cbm/plasma.c
+;
 ;  Converted to prog8 by Irmen de Jong.


-
 main {
    const uword SCREEN1 = $E000
    const uword SCREEN2 = $E400
@ -22,27 +22,39 @@ main {
    sub start() {
        txt.color(1)
        txt.clear_screen()
-        txt.print("creating charset...\n")
+        txt.print("creating charset...\n\nwhile running, press key to stop.\n\n")
        makechar()

        ubyte block = c64.CIA2PRA
-        ; ubyte v = cbm.VMCSB
+        ubyte v = c64.VMCSB
        c64.CIA2PRA = (block & $FC) | (lsb(SCREEN1 >> 14) ^ $03)

-        repeat {
+        uword frames = 0
+        cbm.SETTIM(0,0,0)
+
+        while cbm.GETIN()==0 {
            doplasma(SCREEN1)
            c64.VMCSB = PAGE1
            doplasma(SCREEN2)
            c64.VMCSB = PAGE2
+            frames += 2
        }

-        ; restore screen (if you want)
-        ;c64.VMCSB = v
-        ;c64.CIA2PRA = block
-        ;txt.print("done!\n")
-        ;test_stack.test()
-        ;repeat {
-        ;}
+        uword jiffies = cbm.RDTIM16()
+
+        ; restore screen and display stats
+        c64.VMCSB = v
+        c64.CIA2PRA = block
+        txt.print("time in jiffies: ")
+        txt.print_uw(jiffies)
+        txt.print("\nframes: ")
+        txt.print_uw(frames)
+        uword fps = (frames*60)/jiffies
+        txt.print("\nfps: ")
+        txt.print_uw(fps)
+        txt.print("\ndone!\n")
+        repeat {
+        }
    }

    ; several variables outside of doplasma to make them retain their value
@ -51,7 +63,7 @@ main {
    ubyte c2A
    ubyte c2B

-    sub doplasma(uword screen) {
+    sub doplasma(uword @zp screen) {
        ubyte[40] xbuf
        ubyte[25] ybuf
        ubyte c1a = c1A
@ -78,17 +90,19 @@ main {

        for y in 24 downto 0 {
            for x in 39 downto 0 {
-                ; using a temp var here to enable expression optimization that can't be done on a 'problematic' ROM/RAM memory location
-                ubyte @zp cc = xbuf[x] + ybuf[y]
-                @(screen+x) = cc
-; this is the fastest way to do this inner part:
+                ; split the array expression to avoid a prog8 temporary var inefficiency
+                ; this pure prog8 version achieves ~17 fps
+                ubyte @zp tmp = ybuf[y]
+                @(screen+x) = xbuf[x] + tmp
+; prog8 at this time needs a temp variable to calculate the above expression.
+; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64):
 ;                %asm {{
-;                     ldy  y
-;                     lda  ybuf,y
-;                     ldy  x
+;                     ldy  p8_y
+;                     lda  p8_ybuf,y
+;                     ldy  p8_x
 ;                     clc
-;                     adc  xbuf,y
-;                     sta  (screen),y
+;                     adc  p8_xbuf,y
+;                     sta  (p8_screen),y
 ;                 }}
            }
            screen += 40
@ -99,16 +113,15 @@ main {
        ubyte[8] bittab = [ $01, $02, $04, $08, $10, $20, $40, $80 ]
        ubyte c
        for c in 0 to 255 {
-            ubyte @zp s = math.sin8u(c)
+            ubyte @zp s = math.sin8u(c)     ; chance
            ubyte i
+            ; for all the pixels in the 8x8 character grid, determine (with a rnd chance) if they should be on or off
            for i in 0 to 7 {
                ubyte b=0
                ubyte @zp ii
                for ii in 0 to 7 {
-                    ; use 16 bit rng for a bit more randomness instead of the 8-bit rng
-                    if math.rnd() > s {
+                    if math.rnd() > s
                        b |= bittab[ii]
-                    }
                }
                @(CHARSET + i + c*$0008) = b
            }
--- a/examples/test.p8
+++ b/examples/test.p8
@ -4,10 +4,19 @@
 main {
    sub start() {

-        byte bb = 20
-        word ww= 300
-        ww += bb*3
-        txt.print_w(ww)        ; 240
+        ubyte[5] xx = [11,22,33,44,55]
+        ubyte[5] yy = [101,102,103,104,105]
+        ubyte i=3
+        ubyte j = 4
+        uword screen
+
+        ubyte result = xx[i] + yy[j]        ; TODO optimize to use adc addr,y
+        txt.print_ub(result)    ; 149
+        txt.nl()
+        result = xx[i] + yy[i]              ; TODO optimize to use adc addr,y
+        txt.print_ub(result)    ; 148
+        txt.nl()
+        @(screen+i) = xx[i] + yy[i]     ; TODO why is this using P8ZP_SCRATCH_B1?

 ;        ubyte index = 100
 ;        ubyte[] t_index = [1,2,3,4,5]