diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/BuiltinFunctionsAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/BuiltinFunctionsAsmGen.kt
index 3ebbc797d..f7f34999e 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/BuiltinFunctionsAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/BuiltinFunctionsAsmGen.kt
@@ -275,6 +275,8 @@ internal class BuiltinFunctionsAsmGen(private val program: PtProgram,
               sta  (+)+1
               sty  (+)+2
 +             jsr  0       ; modified""")
+            // TODO: avoid using modifying code here by pushing return address on the stack manually and use JMP (INDIRECT) ?  And if it's just a variable, simply JMP (variable) !
+            // TODO: also do this for CallFar below!
         }
 
         // note: the routine can return a word value (in AY)
diff --git a/compiler/res/prog8lib/buffers.p8 b/compiler/res/prog8lib/buffers.p8
index b4a7a17f0..5021da23b 100644
--- a/compiler/res/prog8lib/buffers.p8
+++ b/compiler/res/prog8lib/buffers.p8
@@ -1,4 +1,4 @@
-; experimental buffer data structures
+; **experimental** buffer data structures, API subject to change!!
 
 %option no_symbol_prefixing, ignore_unused
 
diff --git a/compiler/res/prog8lib/compression.p8 b/compiler/res/prog8lib/compression.p8
new file mode 100644
index 000000000..195bc2885
--- /dev/null
+++ b/compiler/res/prog8lib/compression.p8
@@ -0,0 +1,407 @@
+; **experimental** data compression/decompression routines, API subject to change!!
+
+compression {
+
+    sub encode_rle_outfunc(uword data, uword size, uword output_function, bool is_last_block) {
+        ; -- Compress the given data block using ByteRun1 aka PackBits RLE encoding.
+        ;    output_function = address of a routine that gets a byte arg in A,
+        ;                      this is the next RLE byte to write to the compressed output buffer or file.
+        ;    is_last_block = usually true (this appends the end marker byte 128), but you can set it to false if you
+        ;                    want to concatenate multiple compressed blocks (for instance if the source data is >64Kb)
+        ;    Worst case result storage size needed = (size + (size+126) / 127) + 1
+        ;    This routine is not optimized for speed but for readability and ease of use.
+        uword idx = 0                   ; read position in data
+        uword literals_start_idx = 0    ; index in data of the first pending literal byte
+        ubyte literals_length = 0       ; number of pending literal bytes, flushed when it reaches 128
+
+        asmsub call_output_function(ubyte arg @A) {
+            %asm {{
+                jmp  (p8v_output_function)      ; indirect jump with the byte still in A; the output routine's return goes back to our caller
+            }}
+        }
+
+        sub next_same_span() {
+            ; advances idx over the run of equal bytes at data[idx] (max 128); returns its length in cx16.r1L, and the byte value in cx16.r1H
+            cx16.r1H = data[idx]
+            cx16.r1L = 0
+            while data[idx]==cx16.r1H and cx16.r1L<128 and idx<size {
+                idx++
+                cx16.r1L++
+            }
+        }
+
+        sub output_literals() {
+            call_output_function(literals_length-1)     ; control byte 0..127 = number of literal bytes that follow, minus 1
+            uword dataptr = data + literals_start_idx
+            ubyte i
+            for i in 0 to literals_length-1 {
+                call_output_function(@(dataptr))
+                dataptr++
+            }
+            literals_length = 0
+        }
+
+        while idx<size {
+            next_same_span()     ; count in r1L, value in r1H
+            if cx16.r1L>1 {
+                ; a replicate run (2..128 equal bytes): flush pending literals first, then write control byte + value
+                if literals_length>0
+                    output_literals()
+                call_output_function((cx16.r1L^255)+2)        ;  257-cx16.r1L, which is -(count-1) as a signed byte
+                call_output_function(cx16.r1H)
+            }
+            else {
+                ; a single byte: add it to the literals run (idx already points past it, hence idx-1)
+                if literals_length==128
+                    output_literals()
+                if literals_length==0
+                    literals_start_idx = idx-1
+                literals_length++
+            }
+        }
+
+        if literals_length>0
+            output_literals()
+
+        if is_last_block
+            call_output_function(128)       ; end-of-data marker
+    }
+
+    sub encode_rle(uword data, uword size, uword target, bool is_last_block) -> uword {
+        ; -- Compress the given data block using ByteRun1 aka PackBits RLE encoding, writing the result to memory at target.
+        ;    Returns the size of the compressed RLE data. Worst case result storage size needed = (size + (size+126) / 127) + 1.
+        ;    is_last_block = usually true (this appends the end marker byte 128), but you can set it to false if you
+        ;                    want to concatenate multiple compressed blocks (for instance if the source data is >64Kb)
+        ;    This routine is not optimized for speed but for readability and ease of use.
+        uword idx = 0                   ; read position in data
+        uword literals_start_idx = 0    ; index in data of the first pending literal byte
+        ubyte literals_length = 0       ; number of pending literal bytes, flushed when it reaches 128
+        uword orig_target = target      ; remembered to compute the compressed size at the end
+
+        sub next_same_span() {
+            ; advances idx over the run of equal bytes at data[idx] (max 128); returns its length in cx16.r1L, and the byte value in cx16.r1H
+            cx16.r1H = data[idx]
+            cx16.r1L = 0
+            while data[idx]==cx16.r1H and cx16.r1L<128 and idx<size {
+                idx++
+                cx16.r1L++
+            }
+        }
+
+        sub output_literals() {
+            @(target) = literals_length-1       ; control byte 0..127 = number of literal bytes that follow, minus 1
+            target++
+            uword dataptr = data + literals_start_idx
+            ubyte i
+            for i in 0 to literals_length-1 {
+                @(target) = @(dataptr)
+                target++
+                dataptr++
+            }
+            literals_length = 0
+        }
+
+        while idx<size {
+            next_same_span()     ; count in r1L, value in r1H
+            if cx16.r1L>1 {
+                ; a replicate run (2..128 equal bytes): flush pending literals first, then write control byte + value
+                if literals_length>0
+                    output_literals()
+                @(target) = (cx16.r1L^255)+2        ;  257-cx16.r1L, which is -(count-1) as a signed byte
+                target++
+                @(target) = cx16.r1H
+                target++
+            }
+            else {
+                ; a single byte: add it to the literals run (idx already points past it, hence idx-1)
+                if literals_length==128
+                    output_literals()
+                if literals_length==0
+                    literals_start_idx = idx-1
+                literals_length++
+            }
+        }
+
+        if literals_length>0
+            output_literals()
+
+        if is_last_block {
+            @(target) = 128         ; end-of-data marker
+            target ++
+        }
+
+        return target-orig_target
+    }
+
+    asmsub decode_rle_srcfunc(uword source_function @AY, uword target @R0, uword maxsize @R1) clobbers(X) -> uword @AY {
+        ; -- Decodes "ByteRun1" (aka PackBits) RLE compressed data into target. Control byte value 128 ends the decoding. Returns the decompressed size in AY.
+        ;    Instead of a source buffer, you provide a callback routine that must return the next byte to decompress in A (its status flags are not relied upon).
+        ;    Also stops when maxsize has been reached (tested before each run only, so the last run can extend past it). The callback must leave P8ZP_SCRATCH_W2, _B1 and _REG intact.
+        %asm {{
+            sta  _cb_mod1+1         ; patch the callback address into the three jsr instructions below
+            sty  _cb_mod1+2
+            sta  _cb_mod2+1
+            sty  _cb_mod2+2
+            sta  _cb_mod3+1
+            sty  _cb_mod3+2
+            lda  cx16.r0L
+            ldy  cx16.r0H
+            sta  P8ZP_SCRATCH_W2        ; target ptr
+            sta  _orig_target
+            sty  P8ZP_SCRATCH_W2+1
+            sty  _orig_target+1
+            lda  cx16.r0L
+            clc
+            adc  cx16.r1L
+            sta  cx16.r1L
+            lda  cx16.r0H
+            adc  cx16.r1H
+            sta  cx16.r1H        ; decompression limit = target + maxsize
+
+_loop
+            ; while target (W2) < limit (r1)
+            lda  P8ZP_SCRATCH_W2
+            ldy  P8ZP_SCRATCH_W2+1
+            cmp  cx16.r1L
+            tya
+            sbc  cx16.r1H
+            bcs  _end
+
+_cb_mod1    jsr  $ffff      ; modified: fetch the control byte
+            cmp  #128       ; classify on the value in A itself, not on whatever flags the callback left behind
+            beq  _end       ; 128 = end marker
+            bcc  _literals  ; 0..127 = literal run
+
+            ; control byte n is 129..255: replicate the next byte 257-n (= -n+1 as signed byte) times
+            eor  #255
+            clc
+            adc  #2
+            sta  P8ZP_SCRATCH_REG       ; run length, kept across the callback
+_cb_mod2    jsr  $ffff      ; modified: fetch the byte to replicate
+            ldx  P8ZP_SCRATCH_REG
+            ldy  #0
+-           sta  (P8ZP_SCRATCH_W2),y
+            iny
+            dex
+            bne  -
+            ; add the run length to target
+            lda  P8ZP_SCRATCH_REG
+            clc
+            adc  P8ZP_SCRATCH_W2
+            sta  P8ZP_SCRATCH_W2
+            lda  #0
+            adc  P8ZP_SCRATCH_W2+1
+            sta  P8ZP_SCRATCH_W2+1
+            jmp  _loop
+_literals
+            ; control byte n is 0..127: copy the next n+1 bytes
+            pha                         ; keep n for the pointer update afterwards
+            sta  P8ZP_SCRATCH_B1        ; down-counter, the loop ends when it goes negative
+            ldy  #0
+            sty  P8ZP_SCRATCH_REG       ; write offset, kept in memory because the callback may change Y
+_cb_mod3    jsr  $ffff      ; modified: fetch the next literal byte
+            ldy  P8ZP_SCRATCH_REG
+            sta  (P8ZP_SCRATCH_W2),y
+            inc  P8ZP_SCRATCH_REG
+            dec  P8ZP_SCRATCH_B1
+            bpl  _cb_mod3
+            ; add N+1 to target (the set carry supplies the +1)
+            pla
+            sec
+            adc  P8ZP_SCRATCH_W2
+            sta  P8ZP_SCRATCH_W2
+            lda  #0
+            adc  P8ZP_SCRATCH_W2+1
+            sta  P8ZP_SCRATCH_W2+1
+            jmp  _loop
+
+_orig_target    .word  0
+_end
+            ; return w2-orig_target, the size of the decompressed data
+            lda  P8ZP_SCRATCH_W2
+            ldy  P8ZP_SCRATCH_W2+1
+            sec
+            sbc  _orig_target
+            tax
+            tya
+            sbc  _orig_target+1
+            tay
+            txa
+            rts
+        }}
+    }
+
+    asmsub decode_rle(uword compressed @AY, uword target @R0, uword maxsize @R1) clobbers(X) -> uword @AY {
+        ; -- Decodes "ByteRun1" (aka PackBits) RLE compressed data into target. Control byte value 128 ends the decoding. Returns the decompressed size in AY.
+        ;    Also stops decompressing when the maxsize has been reached, but this is only tested before each run, so the last run written can extend past it.
+        %asm {{
+            sta  P8ZP_SCRATCH_W1        ; compressed data ptr
+            sty  P8ZP_SCRATCH_W1+1
+            lda  cx16.r0L
+            ldy  cx16.r0H
+            sta  P8ZP_SCRATCH_W2        ; target ptr
+            sta  _orig_target
+            sty  P8ZP_SCRATCH_W2+1
+            sty  _orig_target+1
+            lda  cx16.r0L
+            clc
+            adc  cx16.r1L
+            sta  cx16.r1L
+            lda  cx16.r0H
+            adc  cx16.r1H
+            sta  cx16.r1H        ; decompression limit = target + maxsize
+
+_loop       ; while target (W2) < limit (r1)
+        	lda  P8ZP_SCRATCH_W2
+	        ldy  P8ZP_SCRATCH_W2+1
+	        cmp  cx16.r1L
+	        tya
+	        sbc  cx16.r1H
+	        bcs  _end
+
+            ldy  #0
+            lda  (P8ZP_SCRATCH_W1),y    ; control byte
+            bpl  _literals              ; 0..127 = literal run
+            cmp  #128
+            beq  _end                   ; 128 = end marker
+
+            ; control byte n is 129..255: replicate the next byte 257-n (= -n+1 as signed byte) times
+            eor  #255
+            clc
+            adc  #2
+            pha                         ; keep the run length for the pointer update afterwards
+            tax
+            iny
+            lda  (P8ZP_SCRATCH_W1),y    ; the byte to replicate
+            ldy  #0
+-           sta  (P8ZP_SCRATCH_W2),y
+            iny
+            dex
+            bne  -
+            ; add the run length (saved on the stack) to target
+            pla
+            clc
+            adc  P8ZP_SCRATCH_W2
+            sta  P8ZP_SCRATCH_W2
+            lda  #0
+            adc  P8ZP_SCRATCH_W2+1
+            sta  P8ZP_SCRATCH_W2+1
+            ; increase source by 2 (control byte + value byte)
+            clc
+            lda  P8ZP_SCRATCH_W1
+            adc  #2
+            sta  P8ZP_SCRATCH_W1
+            lda  #0
+            adc  P8ZP_SCRATCH_W1+1
+            sta  P8ZP_SCRATCH_W1+1
+            jmp  _loop
+_literals
+            ; control byte n is 0..127: copy the next n+1 bytes
+            pha                         ; keep n for the pointer updates afterwards
+            tax
+            inc  P8ZP_SCRATCH_W1        ; step the source pointer over the control byte
+            bne  +
+            inc  P8ZP_SCRATCH_W1+1
++           ldy  #0
+-           lda  (P8ZP_SCRATCH_W1),y
+            sta  (P8ZP_SCRATCH_W2),y
+            iny
+            dex
+            bpl  -
+            ; add N+1 to source (the set carry supplies the +1)
+            pla
+            tax
+            sec
+            adc  P8ZP_SCRATCH_W1
+            sta  P8ZP_SCRATCH_W1
+            lda  #0
+            adc  P8ZP_SCRATCH_W1+1
+            sta  P8ZP_SCRATCH_W1+1
+            ; add N+1 to target as well
+            txa
+            sec
+            adc  P8ZP_SCRATCH_W2
+            sta  P8ZP_SCRATCH_W2
+            lda  #0
+            adc  P8ZP_SCRATCH_W2+1
+            sta  P8ZP_SCRATCH_W2+1
+            jmp  _loop
+
+_orig_target    .word  0
+_end
+            ; return w2-orig_target, the size of the decompressed data
+            lda  P8ZP_SCRATCH_W2
+            ldy  P8ZP_SCRATCH_W2+1
+            sec
+            sbc  _orig_target
+            tax
+            tya
+            sbc  _orig_target+1
+            tay
+            txa
+            rts
+        }}
+    }
+
+
+/***
+    ; reference prog8 source code for the two asm routines above (decode_rle and decode_rle_srcfunc):
+
+    sub decode_rle_prog8(uword @zp compressed, uword @zp target, uword maxsize) -> uword {
+        cx16.r0 = target    ; original target
+        cx16.r1 = target+maxsize     ; decompression limit
+
+        while target<cx16.r1 {
+            cx16.r2L = @(compressed)
+            if_neg {
+                if cx16.r2L==128
+                    break
+                ; replicate the next byte -n+1 times
+                compressed++
+                cx16.r3L = @(compressed)
+                repeat 2+(cx16.r2L^255) {
+                    @(target) = cx16.r3L
+                    target++
+                }
+                compressed++
+            } else {
+                ; copy the next n+1 bytes
+                compressed++
+                repeat cx16.r2L+1 {
+                    @(target) = @(compressed)
+                    compressed++
+                    target++
+                }
+            }
+        }
+        return target-cx16.r0
+    }
+
+    sub decode_rle_callback_prog8(uword producer_callback, uword @zp target, uword maxsize) -> uword {
+        cx16.r0 = target   ; original target
+        cx16.r1 = target+maxsize     ; decompression limit
+
+        while target<cx16.r1 {
+            cx16.r2L = lsb(call(producer_callback))
+            if_neg {
+                if cx16.r2L==128
+                    break
+                ; replicate the next byte -n+1 times
+                cx16.r3L = lsb(call(producer_callback))
+                repeat 2+(cx16.r2L^255) {
+                    @(target) = cx16.r3L
+                    target++
+                }
+            } else {
+                ; copy the next n+1 bytes
+                repeat cx16.r2L+1 {
+                    @(target) = lsb(call(producer_callback))
+                    target++
+                }
+            }
+        }
+        return target-cx16.r0
+    }
+***/
+
+}
diff --git a/compiler/res/prog8lib/virtual/compression.p8 b/compiler/res/prog8lib/virtual/compression.p8
new file mode 100644
index 000000000..af0dd9dbb
--- /dev/null
+++ b/compiler/res/prog8lib/virtual/compression.p8
@@ -0,0 +1,99 @@
+compression {
+
+    sub decode_rle(uword @zp compressed, uword @zp target, uword maxsize) -> uword {
+        cx16.r0 = target    ; original target
+        cx16.r1 = target+maxsize     ; decompression limit (only tested before each run)
+
+        while target<cx16.r1 {
+            cx16.r2L = @(compressed)        ; control byte (the if_neg below presumably tests the flag left by this load - confirm)
+            if_neg {
+                if cx16.r2L==128
+                    break       ; 128 = end marker
+                ; control byte n is 129..255: replicate the next byte 257-n (= -n+1 as signed byte) times
+                compressed++
+                cx16.r3L = @(compressed)
+                repeat 2+(cx16.r2L^255) {
+                    @(target) = cx16.r3L
+                    target++
+                }
+                compressed++
+            } else {
+                ; control byte n is 0..127: copy the next n+1 bytes
+                compressed++
+                repeat cx16.r2L+1 {
+                    @(target) = @(compressed)
+                    compressed++
+                    target++
+                }
+            }
+        }
+        return target-cx16.r0       ; the size of the decompressed data
+    }
+
+    sub encode_rle(uword data, uword size, uword target, bool is_last_block) -> uword {
+        ; -- Compress the given data block using ByteRun1 aka PackBits RLE encoding, writing the result to memory at target.
+        ;    Returns the size of the compressed RLE data. Worst case result storage size needed = (size + (size+126) / 127) + 1.
+        ;    is_last_block = usually true (this appends the end marker byte 128), but you can set it to false if you
+        ;                    want to concatenate multiple compressed blocks (for instance if the source data is >64Kb)
+        ;    This routine is not optimized for speed but for readability and ease of use.
+        uword idx = 0                   ; read position in data
+        uword literals_start_idx = 0    ; index in data of the first pending literal byte
+        ubyte literals_length = 0       ; number of pending literal bytes, flushed when it reaches 128
+        uword orig_target = target      ; remembered to compute the compressed size at the end
+
+        sub next_same_span() {
+            ; advances idx over the run of equal bytes at data[idx] (max 128); returns its length in cx16.r1L, and the byte value in cx16.r1H
+            cx16.r1H = data[idx]
+            cx16.r1L = 0
+            while data[idx]==cx16.r1H and cx16.r1L<128 and idx<size {
+                idx++
+                cx16.r1L++
+            }
+        }
+
+        sub output_literals() {
+            @(target) = literals_length-1       ; control byte 0..127 = number of literal bytes that follow, minus 1
+            target++
+            uword dataptr = data + literals_start_idx
+            ubyte i
+            for i in 0 to literals_length-1 {
+                @(target) = @(dataptr)
+                target++
+                dataptr++
+            }
+            literals_length = 0
+        }
+
+        while idx<size {
+            next_same_span()     ; count in r1L, value in r1H
+            if cx16.r1L>1 {
+                ; a replicate run (2..128 equal bytes): flush pending literals first, then write control byte + value
+                if literals_length>0
+                    output_literals()
+                @(target) = (cx16.r1L^255)+2        ;  257-cx16.r1L, which is -(count-1) as a signed byte
+                target++
+                @(target) = cx16.r1H
+                target++
+            }
+            else {
+                ; a single byte: add it to the literals run (idx already points past it, hence idx-1)
+                if literals_length==128
+                    output_literals()
+                if literals_length==0
+                    literals_start_idx = idx-1
+                literals_length++
+            }
+        }
+
+        if literals_length>0
+            output_literals()
+
+        if is_last_block {
+            @(target) = 128         ; end-of-data marker
+            target ++
+        }
+
+        return target-orig_target
+    }
+
+}
diff --git a/compilerAst/src/prog8/ast/SymbolPrinter.kt b/compilerAst/src/prog8/ast/SymbolDumper.kt
similarity index 81%
rename from compilerAst/src/prog8/ast/SymbolPrinter.kt
rename to compilerAst/src/prog8/ast/SymbolDumper.kt
index e8c333a6f..a9e204c26 100644
--- a/compilerAst/src/prog8/ast/SymbolPrinter.kt
+++ b/compilerAst/src/prog8/ast/SymbolDumper.kt
@@ -2,30 +2,53 @@ package prog8.ast
 
 import prog8.ast.statements.*
 import prog8.ast.walk.IAstVisitor
-import prog8.code.core.DataType
-import prog8.code.core.ZeropageWish
-import prog8.code.core.toHex
+import prog8.code.core.*
+import java.io.PrintStream
 
 
 fun printSymbols(program: Program) {
     println()
-    val printer = SymbolPrinter(::print, program, false)
-    printer.visit(program)
+    val symbols = SymbolDumper(false)
+    symbols.visit(program)
+    symbols.write(System.out)
     println()
 }
 
 
-class SymbolPrinter(val output: (text: String) -> Unit, val program: Program, val skipLibraries: Boolean): IAstVisitor {
-    private fun outputln(text: String) = output(text + "\n")
+private class SymbolDumper(val skipLibraries: Boolean): IAstVisitor {
+    private val moduleOutputs = mutableMapOf<Module, MutableList<String>>()
+
+    private var currentModule = Module(mutableListOf(), Position.DUMMY, SourceCode.Generated("dummy"))
+    private fun output(line: String) {      // appends a piece of text to the buffer of the module currently being visited
+        var lines = moduleOutputs[currentModule]
+        if(lines == null) {                 // first output for this module: create its buffer
+            lines = mutableListOf()
+            moduleOutputs[currentModule] = lines
+        }
+        lines.add(line)
+    }
+    private fun outputln(line: String) = output(line + '\n')
+
+    fun write(out: PrintStream) {       // prints the collected text of every module to out, ordered by module name, each under an underlined header
+        for((module, lines) in moduleOutputs.toSortedMap(compareBy { it.name })) {      // NOTE(review): modules with equal names would collapse into one map entry here - confirm names are unique
+            if(lines.any()) {           // skip modules that produced no output
+                val moduleName = "LIBRARY MODULE NAME: ${module.source.name}"
+                out.println()
+                out.println(moduleName)
+                out.println("-".repeat(moduleName.length))      // underline as wide as the header
+                out.println()
+                for (line in lines) {
+                    out.print(line)     // pieces carry their own newlines (see outputln), so no println here
+                }
+            }
+        }
+    }
 
     override fun visit(module: Module) {
         if(!module.isLibrary || !skipLibraries) {
             if(module.source.isFromFilesystem || module.source.isFromResources) {
-                val moduleName = "LIBRARY MODULE NAME: ${module.source.name}"
-                outputln(moduleName)
-                outputln("-".repeat(moduleName.length))
+                currentModule = module
                 super.visit(module)
-                output("\n")
             }
         }
     }
diff --git a/docs/import-all-atari.p8 b/docs/import-all-atari.p8
index 133189259..757ce92bb 100644
--- a/docs/import-all-atari.p8
+++ b/docs/import-all-atari.p8
@@ -2,6 +2,7 @@
 
 %import anyall
 %import buffers
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/import-all-c128.p8 b/docs/import-all-c128.p8
index 335031ed6..b5182054c 100644
--- a/docs/import-all-c128.p8
+++ b/docs/import-all-c128.p8
@@ -2,6 +2,7 @@
 
 %import anyall
 %import buffers
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/import-all-c64.p8 b/docs/import-all-c64.p8
index 7a41ba03c..feb461a7b 100644
--- a/docs/import-all-c64.p8
+++ b/docs/import-all-c64.p8
@@ -2,6 +2,7 @@
 
 %import anyall
 %import buffers
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/import-all-cx16.p8 b/docs/import-all-cx16.p8
index 76d871f81..52361d6ce 100644
--- a/docs/import-all-cx16.p8
+++ b/docs/import-all-cx16.p8
@@ -4,6 +4,7 @@
 %import anyall
 %import buffers
 %import bmx
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/import-all-pet32.p8 b/docs/import-all-pet32.p8
index f0b89204d..4751c3359 100644
--- a/docs/import-all-pet32.p8
+++ b/docs/import-all-pet32.p8
@@ -2,6 +2,7 @@
 
 %import anyall
 %import buffers
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/import-all-virtual.p8 b/docs/import-all-virtual.p8
index 175946a64..7d7741e76 100644
--- a/docs/import-all-virtual.p8
+++ b/docs/import-all-virtual.p8
@@ -2,6 +2,7 @@
 
 %import anyall
 %import buffers
+%import compression
 %import conv
 %import cx16logo
 %import diskio
diff --git a/docs/source/comparing.rst b/docs/source/comparing.rst
index c60c9aa9d..134e8c377 100644
--- a/docs/source/comparing.rst
+++ b/docs/source/comparing.rst
@@ -42,6 +42,7 @@ Variables
   (but user written libraries are possible that provide that indirectly).
 - Variables can be declared everywhere inside the code but all variable declarations in a subroutine
   are moved to the top of the subroutine. A for loop, or if/else blocks do not introduce a new scope.
+  Subroutines (including nested ones) *do* introduce a new scope.
 - All variables are initialized at the start of the program. There is no random garbage in them: they are zero or any other initialization value you provide.
 - This als means you can run a Prog8 program multiple times without having to reload it from disk, unlike programs produced by most other compilers targeting these 8 bit platforms.
 
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 60ce8773b..60d84910f 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -186,6 +186,7 @@ memory (name, size, alignment)
 call (address) -> uword
     Calls a subroutine given by its memory address. You cannot pass arguments directly,
     although it is ofcourse possible to do this via the global ``cx16.r0...`` registers for example.
+    It is *not* possible to use cpu registers to pass arguments, because these are clobbered while performing the call!
     It is assumed the subroutine returns a word value (in AY), if it does not, just add void to the call to ignore the result value.
     This function effectively creates an "indirect JSR" if you use it on a ``uword`` pointer variable.
     But because it doesn't handle bank switching etcetera by itself,
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 4de567291..bf020fc13 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,16 @@
 TODO
 ====
 
+Regenerate skeleton doc files.
+
+"invalid number of arguments" -> print the list of missing arguments
+
+call() asm gen in funcCall() could be improved by not using self-modifying code, see the TODO.
+
+callfar() should allow setting an argument in the X register as well?
+
+Add a new SublimeText syntax file for prog8, and also install this for bat: https://github.com/sharkdp/bat?tab=readme-ov-file#adding-new-syntaxes--language-definitions
+
 Improve register load order in subroutine call args assignments:
 in certain situations, the "wrong" order of evaluation of function call arguments is done which results
 in overwriting registers that already got their value, which requires a lot of stack juggling (especially on plain 6502 cpu!)
diff --git a/examples/cx16/cobramk3-gfx.p8 b/examples/cx16/cobramk3-gfx.p8
index a198fdd26..1e99fd928 100644
--- a/examples/cx16/cobramk3-gfx.p8
+++ b/examples/cx16/cobramk3-gfx.p8
@@ -5,6 +5,8 @@
 %import verafx
 
 ; TODO add all other Elite's ships, show their name, advance to next ship on keypress
+; TODO fix the camera normal calculation for the hidden surface removal
+; TODO embed pre calculated surface normals???
 
 main {
     sub start()  {
diff --git a/examples/test.p8 b/examples/test.p8
index ac4155400..68b85a961 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,14 +1,116 @@
 %import textio
+%import string
+%import compression
 %zeropage basicsafe
 %option no_sysinit
 
 main {
     sub start() {
-        uword @shared address = $C09F
-        ubyte @shared bank = 10
-        uword @shared argument = $1234
+        ubyte[] rle = [
+            5, '1', '2', '3', '4', '5', '6',
+            3, 'a', 'b', 'c', 'd',
+            0, 'z',
+            3, '1', '2', '3','4',
+            -5, '=',
+            -127, '+',
+            4, '!', '@', '#', '$', '%',
+            128]
 
-        void callfar(bank, address, argument)
-        void callfar(10, $C09F, argument)
+        str @shared result = "\x00"*200
+
+        uword ptr = &rle
+
+        txt.print_uw(compression.decode_rle_srcfunc(callback, result, len(result)))
+        txt.nl()
+        txt.print_uw(compression.decode_rle(rle, result, len(result)))
+        txt.nl()
+        txt.print_uw(string.length(result))
+        txt.nl()
+        txt.print(result)
+        txt.nl()
+        txt.print_uwhex(&result, true)
+        txt.nl()
+
+        sub callback() -> ubyte {
+            ubyte x = @(ptr)
+            ptr++
+            return x
+        }
+
+
+        ubyte[256] buffer
+        ubyte[256] buffer2
+        uword bufptr = &buffer
+        uword outputsize=0
+
+        sub outputter(ubyte value) {
+            @(bufptr) = value
+            bufptr++
+            outputsize++
+        }
+
+        str input = iso:"123456aaaaabcdzzz1234======++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++!@#$%"
+        ubyte[31] expected_rle = [$5, $31, $32, $33, $34, $35, $36, $fc, $61, $2, $62, $63, $64, $fe, $7a, $3, $31, $32, $33, $34, $fb, $3d, $81, $2b, $4, $21, $40, $23, $24, $25, $80]
+        txt.print(input)
+        txt.nl()
+
+        compression.encode_rle_outfunc(input, len(input), outputter, true)
+        txt.print_uw(outputsize)
+        txt.nl()
+        if outputsize!=len(expected_rle)
+            txt.print("wrong rle size!\n")
+
+        txt.print("\ncompare rle (encode using callback):\n")
+        for cx16.r9L in 0 to len(expected_rle)-1 {
+;            txt.print_ub0(cx16.r9L)
+;            txt.spc()
+;            txt.print_ubhex(expected_rle[cx16.r9L], false)
+;            txt.spc()
+;            txt.print_ubhex(buffer[cx16.r9L], false)
+            if buffer[cx16.r9L] != expected_rle[cx16.r9L]
+                txt.print(" wrong rle data!")
+;            txt.nl()
+        }
+        txt.nl()
+
+        cx16.r0 = compression.decode_rle(buffer, buffer2, len(buffer2))
+        if cx16.r0 != len(input)
+            txt.print("wrong decompress result!\n")
+        else {
+            txt.print("good result: ")
+            txt.print(buffer2)
+            txt.nl()
+        }
+
+
+        buffer[0] = buffer[1] = buffer[2] = 128
+        outputsize = compression.encode_rle(input, len(input), buffer, true)
+        txt.print_uw(outputsize)
+        txt.nl()
+        if outputsize!=len(expected_rle)
+            txt.print("wrong rle size!\n")
+
+        txt.print("\ncompare rle (encode into buffer):\n")
+        for cx16.r9L in 0 to len(expected_rle)-1 {
+;            txt.print_ub0(cx16.r9L)
+;            txt.spc()
+;            txt.print_ubhex(expected_rle[cx16.r9L], false)
+;            txt.spc()
+;            txt.print_ubhex(buffer[cx16.r9L], false)
+            if buffer[cx16.r9L] != expected_rle[cx16.r9L]
+                txt.print(" wrong rle data!")
+;            txt.nl()
+        }
+        txt.nl()
+
+        cx16.r0 = compression.decode_rle(buffer, buffer2, len(buffer2))
+        if cx16.r0 != len(input)
+            txt.print("wrong decompress result!\n")
+        else {
+            txt.print("good result: ")
+            txt.print(buffer2)
+            txt.nl()
+        }
     }
 }
+
diff --git a/syntax-files/SublimeText/readme.txt b/syntax-files/SublimeText/readme.txt
new file mode 100644
index 000000000..60d2d392e
--- /dev/null
+++ b/syntax-files/SublimeText/readme.txt
@@ -0,0 +1,8 @@
+We need a new syntax file for Sublime Text.
+Format: see https://www.sublimetext.com/docs/syntax.html
+
+
+The old (no longer functioning) syntax definition file can be found here:
+
+https://github.com/akubiczek/Prog8-TmLanguage-VsCode/tree/master/sublime3
+
diff --git a/syntax-files/Sublimetext3/readme.txt b/syntax-files/Sublimetext3/readme.txt
deleted file mode 100644
index 88360ab69..000000000
--- a/syntax-files/Sublimetext3/readme.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-A contributed syntax definition file for Sublime Text 3 can be obtained from:
-
-https://github.com/akubiczek/Prog8-TmLanguage-VsCode/tree/master/sublime3
-