diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 94458fa1a..73c8f5df1 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,9 @@
 TODO
 ====
 
+- add special optimization for  @(screen+i) = xbuf[x] + ybuf[y]  and   @(screen+i) = xbuf[x] - ybuf[y]
+  (noticable in plasma.p8 and cube examples?)
+
 - prefix prog8 subroutines with p8s_ instead of p8_ to not let them clash with variables in the asm??
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
diff --git a/examples/c64/plasma.p8 b/examples/c64/plasma.p8
index 937fd69dc..97104b921 100644
--- a/examples/c64/plasma.p8
+++ b/examples/c64/plasma.p8
@@ -1,5 +1,4 @@
 %import syslib
-%import test_stack
 %import textio
 %import math
 
@@ -7,10 +6,11 @@
 ;  which is (w)2001 by groepaz/hitmen
 ;
 ;  Cleanup and porting to C by Ullrich von Bassewitz.
+;  See https://github.com/cc65/cc65/tree/master/samples/cbm/plasma.c
+;
 ;  Converted to prog8 by Irmen de Jong.
 
 
-
 main {
     const uword SCREEN1 = $E000
     const uword SCREEN2 = $E400
@@ -22,27 +22,39 @@ main {
     sub start() {
         txt.color(1)
         txt.clear_screen()
-        txt.print("creating charset...\n")
+        txt.print("creating charset...\n\nwhile running, press key to stop.\n\n")
         makechar()
 
         ubyte block = c64.CIA2PRA
-        ; ubyte v = cbm.VMCSB
+        ubyte v = c64.VMCSB
         c64.CIA2PRA = (block & $FC) | (lsb(SCREEN1 >> 14) ^ $03)
 
-        repeat {
+        uword frames = 0
+        cbm.SETTIM(0,0,0)
+
+        while cbm.GETIN()==0 {
             doplasma(SCREEN1)
             c64.VMCSB = PAGE1
             doplasma(SCREEN2)
             c64.VMCSB = PAGE2
+            frames += 2
         }
 
-        ; restore screen (if you want)
-        ;c64.VMCSB = v
-        ;c64.CIA2PRA = block
-        ;txt.print("done!\n")
-        ;test_stack.test()
-        ;repeat {
-        ;}
+        uword jiffies = cbm.RDTIM16()
+
+        ; restore screen and displays stats
+        c64.VMCSB = v
+        c64.CIA2PRA = block
+        txt.print("time in jiffies: ")
+        txt.print_uw(jiffies)
+        txt.print("\nframes: ")
+        txt.print_uw(frames)
+        uword fps = (frames*60)/jiffies
+        txt.print("\nfps: ")
+        txt.print_uw(fps)
+        txt.print("\ndone!\n")
+        repeat {
+        }
     }
 
     ; several variables outside of doplasma to make them retain their value
@@ -51,7 +63,7 @@ main {
     ubyte c2A
     ubyte c2B
 
-    sub doplasma(uword screen) {
+    sub doplasma(uword @zp screen) {
         ubyte[40] xbuf
         ubyte[25] ybuf
         ubyte c1a = c1A
@@ -78,17 +90,19 @@ main {
 
         for y in 24 downto 0 {
             for x in 39 downto 0 {
-                ; using a temp var here to enable expression optimization that can't be done on a 'problematic' ROM/RAM memory location
-                ubyte @zp cc = xbuf[x] + ybuf[y]
-                @(screen+x) = cc
-; this is the fastest way to do this inner part:
+                ; split the array expression to avoid a prog8 temporary var inefficiency
+                ; this pure prog8 version achieves ~17 fps
+                ubyte @zp tmp = ybuf[y]
+                @(screen+x) = xbuf[x] + tmp
+; prog8 at this time needs a temp variable to calculate the above expression.
+; in optimized asm, this is the fastest way to do this line (achieving ~21 fps on the C64):
 ;                %asm {{
-;                     ldy  y
-;                     lda  ybuf,y
-;                     ldy  x
+;                     ldy  p8_y
+;                     lda  p8_ybuf,y
+;                     ldy  p8_x
 ;                     clc
-;                     adc  xbuf,y
-;                     sta  (screen),y
+;                     adc  p8_xbuf,y
+;                     sta  (p8_screen),y
 ;                 }}
             }
             screen += 40
@@ -99,16 +113,15 @@ main {
         ubyte[8] bittab = [ $01, $02, $04, $08, $10, $20, $40, $80 ]
         ubyte c
         for c in 0 to 255 {
-            ubyte @zp s = math.sin8u(c)
+            ubyte @zp s = math.sin8u(c)     ; chance
             ubyte i
+            ; for all the pixels in the 8x8 character grid, determine (with a rnd chance) if they should be on or off
             for i in 0 to 7 {
                 ubyte b=0
                 ubyte @zp ii
                 for ii in 0 to 7 {
-                    ; use 16 bit rng for a bit more randomness instead of the 8-bit rng
-                    if math.rnd() > s {
+                    if math.rnd() > s
                         b |= bittab[ii]
-                    }
                 }
                 @(CHARSET + i + c*$0008) = b
             }
diff --git a/examples/test.p8 b/examples/test.p8
index cbf4d876f..ec9c61493 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -4,10 +4,19 @@
 main {
     sub start() {
 
-        byte bb = 20
-        word ww= 300
-        ww += bb*3
-        txt.print_w(ww)        ; 240
+        ubyte[5] xx = [11,22,33,44,55]
+        ubyte[5] yy = [101,102,103,104,105]
+        ubyte i=3
+        ubyte j = 4
+        uword screen
+
+        ubyte result = xx[i] + yy[j]        ; TODO optimize to use add addr,y
+        txt.print_ub(result)    ; 149
+        txt.nl()
+        result = xx[i] + yy[i]              ; TODO optimize to use add addr,y
+        txt.print_ub(result)    ; 148
+        txt.nl()
+        @(screen+i) = xx[i] + yy[i]     ; TODO why is this using P8ZP_SCRATCH_B1?
 
 ;        ubyte index = 100
 ;        ubyte[] t_index = [1,2,3,4,5]