diff --git a/docs/source/targetsystem.rst b/docs/source/targetsystem.rst
index 9e182999a..8d9d4cd02 100644
--- a/docs/source/targetsystem.rst
+++ b/docs/source/targetsystem.rst
@@ -152,10 +152,16 @@ The IRQ handler routine must return a boolean value (0 or 1) in the A register:
 
 **CommanderX16 specific notes**
 
+.. sidebar::
+    X16 specific routines
+
+    For the X16 there are also some specialized IRQ handling routines, see :ref:`x16-specific-irq` below.
+
 Note that for the CommanderX16 the set_rasterirq() will disable VSYNC irqs and never call the system IRQ handler regardless
 of the return value of the user handler routine. This also means the default sys.wait() routine won't work anymore,
 when using this handler.
 
+
 These two helper routines are not particularly suited to handle multiple IRQ sources on the Commander X16.
 It's possible but it requires correct fiddling with IRQ enable bits, acknowledging the IRQs, and properly calling
 or not calling the system IRQ handler routine. See the section below for perhaps a better and easier solution that
@@ -192,6 +198,8 @@ will corrupt any Vera operations that were going on in the main program. The rou
     and restored at the end of the handler, further increasing its execution time...
 
 
+.. _x16-specific-irq:
+
 Commander X16 specific IRQ handling
 ===================================
 
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index ebce982d8..13c4d09d3 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,14 @@
 TODO
 ====
 
+Can we move the asm init code that is injected into the start() subroutine, to init_system_phase2 instead?
+
+Doc improvements: some short overview for people coming from other programming languages like C:
+  tell something about prog8 not having function overloading, max 16 bit (u)word integer as native type (and floats sometimes),
+  static variable allocations, no dynamic memory allocation in the language itself (although possible via user written libraries),
+  etc ...
+
+
 Improve register load order in subroutine call args assignments:
 in certain situations, the "wrong" order of evaluation of function call arguments is done which results
 in overwriting registers that already got their value, which requires a lot of stack juggling (especially on plain 6502 cpu!)
diff --git a/examples/c64/balloonflight.p8 b/examples/c64/balloonflight.p8
index f7ab94591..08767250b 100644
--- a/examples/c64/balloonflight.p8
+++ b/examples/c64/balloonflight.p8
@@ -1,38 +1,32 @@
 %import syslib
 %import textio
 %import math
-%import test_stack
 %zeropage basicsafe
 
-; note: The flickering in the scrolling is caused by the CPU requiring
-;       too long to scroll the characters + the colors in course scroll.
-;       This takes nearly a full frame to accomplish, and causes tearing.
-;       It's very difficult to remove this flicker: it requires double buffering
-;       and splitting the coarse character scrolling on multiple phases...
 
 main {
 
-    bool perform_scroll = false
+    bool do_char_scroll = false
 
     sub start() {
+        uword moon_x = 310
         c64.set_sprite_ptr(0, $0f00)           ; alternatively, set directly:  c64.SPRPTR[0] = $0f00 / 64
-        c64.SPENA = 1
+        c64.set_sprite_ptr(1, $0f00+64)        ; alternatively, set directly:  c64.SPRPTR[1] = ($0f00+64) / 64
+        c64.SPENA = %00000011
         c64.SP0COL = 14
+        c64.SP1COL = 7
         c64.SPXY[0] = 80
         c64.SPXY[1] = 100
+        set_moon_pos(moon_x)
 
         c64.SCROLX &= %11110111     ; 38 column mode
-
-        sys.set_rasterirq(&irq.irqhandler, 200)     ; enable animation via raster interrupt
+        sys.set_rasterirq(&irq.irqhandler, 250)     ; enable animation via raster interrupt
 
         ubyte target_height = 10
-        ubyte active_height = 24
+        ubyte active_height = 25
         bool upwards = true
 
         repeat {
-            ;txt.plot(0,0)
-            ;test_stack.test()
-
             ubyte mountain = 223        ; slope upwards
             if active_height < target_height {
                 active_height++
@@ -42,66 +36,124 @@ main {
                 active_height--
                 upwards = false
             } else {
-                target_height = 8 + math.rnd() % 16
+                ; determine new height for next mountain
+                target_height = 9 + math.rnd() % 15
                 if upwards
                     mountain = 233
                 else
                     mountain = 223
             }
 
-            while not perform_scroll {
+            while not do_char_scroll {
                 ; let the raster irq do its timing job
             }
 
-            perform_scroll = false
-            txt.scroll_left(true)
+            do_char_scroll = false
+            scroll_characters_left()
 
-            ; float the balloon
-            if math.rnd() & %10000 !=0
+            ; float the balloon and the moon sprites
+            if math.rnd() & 1 !=0
                 c64.SPXY[1] ++
             else
                 c64.SPXY[1] --
 
+            moon_x--
+            if msb(moon_x)==255
+                moon_x = 340
+            set_moon_pos(moon_x)
+
+            ; draw new mountain etc.
+            const ubyte RIGHT_COLUMN = 39
             ubyte yy
             for yy in 0 to active_height-1 {
-                txt.setcc(39, yy, 32, 2)         ; clear top of screen
+                txt.setcc(RIGHT_COLUMN, yy, 32, 2)         ; clear top of screen
             }
-            txt.setcc(39, active_height, mountain, 8)    ; mountain edge
+            txt.setcc(RIGHT_COLUMN, active_height, mountain, 8)    ; mountain edge
             for yy in active_height+1 to 24 {
-                txt.setcc(39, yy, 160, 8)        ; draw mountain
+                txt.setcc(RIGHT_COLUMN, yy, 160, 8)        ; draw filled mountain
             }
 
-            yy = math.rnd()
-            if yy > 100 {
+            ubyte clutter = math.rnd()
+            if clutter > 100 {
                 ; draw a star
-                txt.setcc(39, yy % (active_height-1), '.', math.rnd())
+                txt.setcc(RIGHT_COLUMN, clutter % (active_height-1), sc:'.', math.rnd())
             }
 
-            if yy > 200 {
+            if clutter > 200 {
                 ; draw a tree
-                ubyte tree = 30
+                ubyte tree = sc:'↑'
                 ubyte treecolor = 5
-                if yy & %01000000 != 0
-                    tree = 88
-                else if yy & %00100000 != 0
-                    tree = 65
+                if clutter & %00010000 != 0
+                    tree = sc:'♣'
+                else if clutter & %00100000 != 0
+                    tree = sc:'♠'
                 if math.rnd() > 130
                     treecolor = 13
-                txt.setcc(39, active_height, tree, treecolor)
+                txt.setcc(RIGHT_COLUMN, active_height, tree, treecolor)
             }
 
-            if yy > 235 {
+            if clutter > 235 {
                 ; draw a camel
-                txt.setcc(39, active_height, 94, 9)
+                txt.setcc(RIGHT_COLUMN, active_height, sc:'π', 9)
             }
         }
     }
+
+    sub set_moon_pos(uword x) {         ; place the moon sprite (sprite 1) at horizontal position x; vertical position is fixed
+        c64.SPXY[2] = lsb(x)            ; low 8 bits of the X coordinate
+        c64.SPXY[3] = 55                ; constant Y coordinate
+        if msb(x)!=0
+            c64.MSIGX |= %00000010      ; x above 255: set bit 1 (the X high bit of sprite 1), leave other bits alone
+        else
+            c64.MSIGX &= %11111101      ; x fits in 8 bits: clear bit 1, leave other bits alone
+    }
+
+    sub scroll_characters_left () {
+	    ; Scroll the bottom half (approx.) of the character screen 1 character to the left
+	    ; contents of the rightmost column are unchanged, you should clear/refill this yourself
+	    ; Without clever split-screen tricks, the C64 is not fast enough to scroll the whole
+	    ; screen smoothly without tearing. So for simplicity it's constrained to fewer rows
+	    ; such that what is scrolled, *does* scroll smoothly.
+	    ; For maximum performance the scrolling is done in unrolled assembly code.
+
+        %asm {{
+		    ldx  #0             ; X = destination column, counts up from 0
+		    ldy  #38            ; Y = pass counter: 38 down to 0 gives 39 passes (columns 0..38 are written)
+-
+        .for row=10, row<=24, row+=1            ; assemble-time loop: emits the copy below once for each of rows 10..24
+            lda  cbm.Screen + 40*row + 1,x      ; character from column x+1 ...
+            sta  cbm.Screen + 40*row + 0,x      ; ... moves to column x
+            lda  cbm.Colors + 40*row + 1,x      ; same shift for the color of that cell
+            sta  cbm.Colors + 40*row + 0,x
+        .next
+
+		    inx
+		    dey
+		    bpl  -              ; repeat until Y drops below zero
+    		rts
+	    }}
+    }
+}
+
+
+irq {
+    ; does the smooth scrolling immediately after the visible screen area,
+    ; so there is no screen tearing. The main loop does the "big" character
+    ; scrolling when the soft-scroll runs out after 8 pixels
+    ubyte smoothx=0                                         ; current fine-scroll offset; the handler keeps it in 0..7
+    sub irqhandler() -> bool {
+        smoothx = (smoothx-1) & 7                           ; step down by one, wrapping from 0 back to 7
+        main.do_char_scroll = smoothx==7                    ; just wrapped: tell the main loop to do the character scroll
+        c64.SCROLX = (c64.SCROLX & %11111000) | smoothx     ; replace only the low 3 bits of SCROLX, keep the other bits
+        return false                                        ; NOTE(review): presumably means "do not run the system IRQ handler afterwards" - confirm
+    }
+}
 
 
 spritedata $0f00 {
-    ; this memory block contains the sprite data
-    ; it must start on an address aligned to 64 bytes.
+    ; this memory block contains the sprite data. it must start on an address aligned to 64 bytes.
+    ; for simplicity, it's currently statically located at $0f00 (not far in memory after the program code),
+    ; but there are ways to do this more dynamically.
     %option force_output    ; make sure the data in this block appears in the resulting program
 
     ubyte[] balloonsprite = [ %00000000,%01111111,%00000000,
@@ -124,17 +176,29 @@ spritedata $0f00 {
                               %00000000,%00111110,%00000000,
                               %00000000,%00111110,%00000000,
                               %00000000,%00111110,%00000000,
-                              %00000000,%00011100,%00000000   ]
+                              %00000000,%00011100,%00000000,
+                              0]
+
+    ubyte[] moonsprite =    [ %00000000,%00000110,%00000000,
+                              %00000000,%00011100,%00000000,
+                              %00000000,%01111000,%00000000,
+                              %00000000,%11111000,%00000000,
+                              %00000001,%11110000,%00000000,
+                              %00000011,%11110000,%00000000,
+                              %00000011,%11110000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000111,%11100000,%00000000,
+                              %00000011,%11110000,%00000000,
+                              %00000011,%11110000,%00000000,
+                              %00000001,%11110000,%00000000,
+                              %00000000,%11111000,%00000000,
+                              %00000000,%01111000,%00000000,
+                              %00000000,%00011100,%00000000,
+                              %00000000,%00000110,%00000000,
+                              0]
 }
-
-
-irq {
-    ubyte smoothx=0
-    sub irqhandler() -> bool {
-        smoothx = (smoothx-1) & 7
-        main.perform_scroll = smoothx==7
-        c64.SCROLX = (c64.SCROLX & %11111000) | smoothx
-        return false
-    }
-}
-
diff --git a/examples/c64/plasma.p8 b/examples/c64/plasma.p8
index 97dc56e26..8f641cab0 100644
--- a/examples/c64/plasma.p8
+++ b/examples/c64/plasma.p8
@@ -92,15 +92,18 @@ main {
         for y in 0 to 24 {
             ubyte @zp @shared yvalue = ybuf[y]
             for x in 0 to 39 {
-                ; @(screen+x) = xbuf[x] + yvalue
-; max optimized asm is this: (achieving ~21 fps on the C64):
-                %asm {{
-                     lda  p8v_yvalue
-                     ldy  p8v_x
-                     clc
-                     adc  p8v_xbuf,y
-                     sta  (p8v_screen),y
-                 }}
+                @(screen+x) = xbuf[x] + yvalue
+
+; optimized asm for the line above is this:
+; (achieving ~23 fps on the C64, about 1 or 2 fps more than with the pure prog8 code):
+;                %asm {{
+;                     lda  p8v_yvalue
+;                     ldy  p8v_x
+;                     clc
+;                     adc  p8v_xbuf,y
+;                     sta  (p8v_screen),y
+;                 }}
+
             }
             screen += 40
         }
diff --git a/examples/c64/wizzine.p8 b/examples/c64/wizzine.p8
index c67f7bae5..8aa4a2b26 100644
--- a/examples/c64/wizzine.p8
+++ b/examples/c64/wizzine.p8
@@ -57,8 +57,9 @@ irq {
             uword @zp x = math.sin8u(angle1-spri*16) as uword + 50
             ubyte @zp y = math.sin8u(angle2-spri*16) / 2 + 70
             c64.SPXYW[spri] = mkword(y, lsb(x))
-            c64.MSIGX <<= 1
-            if msb(x)!=0 c64.MSIGX++
+            if msb(x)!=0
+                sys.set_carry()
+            rol(c64.MSIGX)
         }
         c64.EXTCOL-=8
         return true
diff --git a/gradle.properties b/gradle.properties
index c23cb50c0..31ea33c6f 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -5,4 +5,4 @@ org.gradle.daemon=true
 kotlin.code.style=official
 javaVersion=11
 kotlinVersion=2.0.20
-version=10.4.1
+version=10.4.2-SNAPSHOT