diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/FunctionCallAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/FunctionCallAsmGen.kt
index 88aec8a20..4e74dbd40 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/FunctionCallAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/FunctionCallAsmGen.kt
@@ -82,7 +82,7 @@ internal class FunctionCallAsmGen(private val program: PtProgram, private val as
             is PtAddressOf -> false
             is PtIdentifier -> false
             is PtIrRegister -> false
-            is PtMemoryByte -> true     // TODO might not actually need extra registers if the value has to end up in A
+            is PtMemoryByte -> arg.address !is PtNumber && arg.address !is PtIdentifier
             is PtNumber -> false
             is PtBool -> false
             else -> true
diff --git a/compiler/res/prog8lib/cx16/gfx_lores.p8 b/compiler/res/prog8lib/cx16/gfx_lores.p8
index b3bf0dbd3..562f50045 100644
--- a/compiler/res/prog8lib/cx16/gfx_lores.p8
+++ b/compiler/res/prog8lib/cx16/gfx_lores.p8
@@ -1,10 +1,31 @@
-; optimized graphics routines for just a single screen mode:  lores 320*240, 256c  (8bpp)
+; optimized graphics routines for just the single screen mode: lores 320*240, 256c  (8bpp)
 ; bitmap image needs to start at VRAM addres $00000.
 ; This is compatible with the CX16's screen mode 128.  (void cx16.set_screen_mode(128))
 
 
 gfx_lores {
 
+    sub set_screen_mode() {        ; set up VERA for the lores 320*240 8bpp bitmap on layer 1, then clear it to color 0
+        cx16.VERA_CTRL=0
+        cx16.VERA_DC_VIDEO = (cx16.VERA_DC_VIDEO & %11001111) | %00100000      ; enable only layer 1
+        cx16.VERA_DC_HSCALE = 64       ; presumably 2x horizontal scaling (640 -> 320); confirm against the VERA reference
+        cx16.VERA_DC_VSCALE = 64       ; presumably 2x vertical scaling (480 -> 240); confirm against the VERA reference
+        cx16.VERA_L1_CONFIG = %00000111        ; presumably selects bitmap mode at 8 bpp; confirm against the VERA reference
+        cx16.VERA_L1_MAPBASE = 0
+        cx16.VERA_L1_TILEBASE = 0      ; consistent with the module header: the bitmap must start at VRAM address $00000
+        clear_screen(0)
+    }
+
+    sub clear_screen(ubyte color) {    ; fill the whole 320*240 bitmap with the given color
+        cx16.VERA_CTRL=0
+        cx16.VERA_ADDR=0               ; start writing at the beginning of the bitmap
+        cx16.VERA_ADDR_H = 1<<4    ; 1 pixel auto increment
+        repeat 240                     ; one cs_innerloop320 call per screen row
+            cs_innerloop320(color)     ; stores the color 320 times (40 iterations of 8 stores) to VERA_DATA0
+        cx16.VERA_ADDR=0               ; leave the address back at 0 ...
+        cx16.VERA_ADDR_H = 0           ; ... and with auto increment switched off again
+    }
+
     sub line(uword x1, ubyte y1, uword x2, ubyte y2, ubyte color) {
         ; Bresenham algorithm.
         ; This code special-cases various quadrant loops to allow simple ++ and -- operations.
@@ -141,7 +162,6 @@ times320_hi     .byte `times320
             }}
     }
 
-
     sub horizontal_line(uword xx, ubyte yy, uword length, ubyte color) {
         if length==0
             return
@@ -183,6 +203,26 @@ times320_hi     .byte `times320
         }}
     }
 
+
+    asmsub cs_innerloop320(ubyte color @A) clobbers(Y) {       ; store the color in A 320 times to VERA_DATA0
+        ; using verafx 32 bits writes here would make this faster but it's safer to
+        ; use verafx only explicitly when you know what you're doing.
+        %asm {{
+            ldy  #40                    ; 40 iterations of 8 unrolled stores = 320 bytes
+-           sta  cx16.VERA_DATA0        ; the caller (clear_screen) has enabled address auto increment beforehand
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            sta  cx16.VERA_DATA0
+            dey
+            bne  -
+            rts
+        }}
+    }
+
     inline asmsub vera_setaddr(uword xx @R0, ubyte yy @R1) {
         ; set the correct vera start address (no auto increment yet!)
         %asm {{
diff --git a/compiler/res/prog8lib/cx16/monogfx.p8 b/compiler/res/prog8lib/cx16/monogfx.p8
index 321d72e04..52d675e90 100644
--- a/compiler/res/prog8lib/cx16/monogfx.p8
+++ b/compiler/res/prog8lib/cx16/monogfx.p8
@@ -32,7 +32,7 @@ monogfx {
         width = 320
         height = 240
         mode = MODE_NORMAL
-        clear_screen(0)
+        clear_screen(false)
     }
 
     sub hires() {
@@ -47,7 +47,7 @@ monogfx {
         width = 640
         height = 480
         mode = MODE_NORMAL
-        clear_screen(0)
+        clear_screen(false)
     }
 
     sub textmode() {
@@ -61,16 +61,16 @@ monogfx {
         mode = dm
     }
 
-    sub clear_screen(ubyte color) {
+    sub clear_screen(bool draw) {
         position(0, 0)
         when width {
             320 -> {
                 repeat 240/2/8
-                    cs_innerloop640(color)
+                    cs_innerloop640(draw)
             }
             640 -> {
                 repeat 480/8
-                    cs_innerloop640(color)
+                    cs_innerloop640(draw)
             }
         }
         position(0, 0)
@@ -90,8 +90,8 @@ monogfx {
     }
 
     sub fillrect(uword xx, uword yy, uword rwidth, uword rheight, bool draw) {
-        ; Draw a filled rectangle of the given size and color.
-        ; To fill the whole screen, use clear_screen(color) instead - it is much faster.
+        ; Draw a filled rectangle of the given size.
+        ; To fill the whole screen, use clear_screen(draw) instead - it is much faster.
         if rwidth==0
             return
         repeat rheight {
@@ -1000,7 +1000,7 @@ cdraw_mod2              ora  cx16.VERA_DATA1
         }
     }
 
-    asmsub cs_innerloop640(ubyte color @A) clobbers(Y) {
+    asmsub cs_innerloop640(bool draw @A) clobbers(Y) {
         ; using verafx 32 bits writes here would make this faster but it's safer to
         ; use verafx only explicitly when you know what you're doing.
         %asm {{
diff --git a/compiler/res/prog8lib/virtual/monogfx.p8 b/compiler/res/prog8lib/virtual/monogfx.p8
index 00fc6f118..811154ee3 100644
--- a/compiler/res/prog8lib/virtual/monogfx.p8
+++ b/compiler/res/prog8lib/virtual/monogfx.p8
@@ -23,7 +23,7 @@ monogfx {
         width = 320
         height = 240
         mode = MODE_NORMAL
-        clear_screen(0)
+        clear_screen(false)
     }
 
     sub hires() {
@@ -32,7 +32,7 @@ monogfx {
         width = 640
         height = 480
         mode = MODE_NORMAL
-        clear_screen(0)
+        clear_screen(false)
     }
 
     sub textmode() {
@@ -43,9 +43,10 @@ monogfx {
         mode = dm
     }
 
-    sub clear_screen(ubyte color) {
-        if color!=0
-            color=255
+    sub clear_screen(bool draw) {
+        ubyte color = 0
+        if draw
+            color = 255
         sys.gfx_clear(color)
     }
 
@@ -64,7 +65,7 @@ monogfx {
 
     sub fillrect(uword xx, uword yy, uword rwidth, uword rheight, bool draw) {
         ; Draw a filled rectangle of the given size and color.
-        ; To fill the whole screen, use clear_screen(color) instead - it is much faster.
+        ; To fill the whole screen, use clear_screen(draw) instead - it is much faster.
         if rwidth==0
             return
         repeat rheight {
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index ef2010796..1d58b8135 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -903,7 +903,7 @@ to see what's in there.
 gfx2  (cx16 only)
 -----------------
 Full-screen multicolor bitmap graphics routines, available on the Cx16 machine only.
-Same interface as monogfx, but for color screens. For 1 bpp monochrome screens, use monogfx.
+Same interface as monogfx, but for color screens. For 1 bpp monochrome screens: use monogfx.
 
 - multiple full-screen bitmap color resolutions
 - clearing screen, switching screen mode, also back to text mode
@@ -916,6 +916,15 @@ Read the `gfx2 source code <https://github.com/irmen/prog8/tree/master/compiler/
 to see what's in there.
 
 
+gfx_lores  (cx16 only)
+----------------------
+Heavily optimized graphics routines for just the single screen mode: lores 320*240, 256c  (8bpp).
+This is screen mode 1 from the gfx2 module (and also compatible with the X16's BASIC screen mode 128).
+
+Read the `gfx_lores source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/gfx_lores.p8>`_
+to see what's in there.
+
+
 palette  (cx16 only)
 --------------------
 Available for the Cx16 target. Various routines to set the display color palette.
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index fb5d3198b..51f7a0e0a 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,25 +1,27 @@
 TODO
 ====
 
+Regenerate skeletons in doc.
+
+
 Improve register load order in subroutine call args assignments:
 in certain situations, the "wrong" order of evaluation of function call arguments is done which results
 in overwriting registers that already got their value, which requires a lot of stack juggling (especially on plain 6502 cpu!)
 Maybe this routine can be made more intelligent.  See usesOtherRegistersWhileEvaluating() and argumentsViaRegisters().
 
 
-Regenerate skeletons in doc.
-
-
 Future Things and Ideas
 ^^^^^^^^^^^^^^^^^^^^^^^
 Compiler:
 
 - Some facility to use add-with-carry and sub-with-carry (so we can chain additions/subtractions without clc/sec inserted every time)
+  Note: +/- 0  can't be optimized away anymore in this case!
+  Note2: may need to preserve carry flag during evaluation of the operands!
+  Note3: only available for bytes? (or does it work on words automatically?), and perhaps restrict operand to a simple expression?
 - Can we support signed % (remainder) somehow?
 - Don't add "random" rts to %asm blocks but instead give a warning about it? (but this breaks existing behavior that others already depend on... command line switch?)
 - IR: implement missing operators in AssignmentGen  (array shifts etc)
 - IR: CMPI+BSTEQ --> new BEQ reg,value,label instruction (like BGT etc)
-- expand the kata encoding to somehow translate normal katana to half-widths?  (see comment in KatakanaEncoding)
 - instead of copy-pasting inline asmsubs, make them into a 64tass macro and use that instead.
   that will allow them to be reused from custom user written assembly code as well.
 - Multidimensional arrays and chained indexing, purely as syntactic sugar over regular arrays.
diff --git a/examples/cx16/pcmaudio/adpcm.p8 b/examples/cx16/pcmaudio/adpcm.p8
index fabf4a32b..af4da39ac 100644
--- a/examples/cx16/pcmaudio/adpcm.p8
+++ b/examples/cx16/pcmaudio/adpcm.p8
@@ -65,10 +65,10 @@ adpcm {
     }
 
     sub decode_nibble(ubyte @zp nibble) {
-        ; Decoder for nibbles for the first channel.
-        ; this is the hotspot of the decoder algorithm!
+        ; Decoder for a single nibble for the first channel. (value of 'nibble' needs to be strictly 0-15 !)
+        ; This is the hotspot of the decoder algorithm!
         ; Note that the generated assembly from this is pretty efficient,
-        ; rewriting it by hand in asm seems to improve it only 5-10%
+        ; rewriting it by hand in asm seems to improve it by only ~10%.
         cx16.r0s = 0                ; difference
         if nibble & %0100 !=0
             cx16.r0s += pstep
diff --git a/examples/cx16/pcmaudio/play-adpcm.p8 b/examples/cx16/pcmaudio/play-adpcm.p8
index d0c42d0bf..6d4a9511c 100644
--- a/examples/cx16/pcmaudio/play-adpcm.p8
+++ b/examples/cx16/pcmaudio/play-adpcm.p8
@@ -153,10 +153,10 @@ mono {
         repeat 252/2 {
             unroll 2 {
                 nibble = @(main.nibblesptr)
-                adpcm.decode_nibble(nibble & 15)     ; first word
+                adpcm.decode_nibble(nibble & 15)     ; first word  (note: upper nibble needs to be zero!)
                 cx16.VERA_AUDIO_DATA = lsb(adpcm.predict)
                 cx16.VERA_AUDIO_DATA = msb(adpcm.predict)
-                adpcm.decode_nibble(nibble>>4)       ; second word
+                adpcm.decode_nibble(nibble>>4)       ; second word  (note: upper nibble is zero, after the shifts.)
                 cx16.VERA_AUDIO_DATA = lsb(adpcm.predict)
                 cx16.VERA_AUDIO_DATA = msb(adpcm.predict)
                 main.nibblesptr++
@@ -219,6 +219,7 @@ stereo {
 
     sub decode_nibbles_unrolled() {
         ; decode 4 left channel nibbles
+        ; note: when calling decode_nibble(), the upper nibble in the argument needs to be zero
         uword[8] left
         uword[8] right
         ubyte @requirezp nibble = @(main.nibblesptr)
diff --git a/examples/cx16/pcmaudio/stream-wav.p8 b/examples/cx16/pcmaudio/stream-wav.p8
index d0972a032..9f7c1a10b 100644
--- a/examples/cx16/pcmaudio/stream-wav.p8
+++ b/examples/cx16/pcmaudio/stream-wav.p8
@@ -304,6 +304,7 @@ _lp2        lda  $ffff,y
         repeat 252/2 {
             unroll 2 {
                 nibble = @(nibblesptr)
+                ; note: when calling decode_nibble(), the upper nibble in the argument needs to be zero
                 adpcm.decode_nibble(nibble & 15)     ; first word
                 cx16.VERA_AUDIO_DATA = lsb(adpcm.predict)
                 cx16.VERA_AUDIO_DATA = msb(adpcm.predict)
@@ -330,6 +331,7 @@ _lp2        lda  $ffff,y
 
     sub decode_nibbles_unrolled() {
         ; decode 4 left channel nibbles
+        ; note: when calling decode_nibble(), the upper nibble in the argument needs to be zero
         uword[8] left
         uword[8] right
         ubyte @requirezp nibble = @(nibblesptr)
diff --git a/examples/cx16/testmonogfx.p8 b/examples/cx16/testmonogfx.p8
index b4dd75a2a..625d4f264 100644
--- a/examples/cx16/testmonogfx.p8
+++ b/examples/cx16/testmonogfx.p8
@@ -195,8 +195,8 @@ main {
         }
 
         sys.wait(60)
-        monogfx.clear_screen(1)
-        monogfx.clear_screen(0)
+        monogfx.clear_screen(true)
+        monogfx.clear_screen(false)
 
         ubyte radius
 
diff --git a/examples/test.p8 b/examples/test.p8
index 57a1f5728..a7b1386a3 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,27 +1,11 @@
 %option no_sysinit
+%import gfx_lores
 %zeropage basicsafe
 
 main {
     sub start() {
-        ; nothing!
-    }
-}
-
-
-derp {
-    asmsub f_tell() -> uword @R0, uword @R1, uword @R2, uword @R3 {
-        %asm {{
-            jmp  p8s_internal_f_tell
-        }}
-    }
-
-    sub internal_f_tell() {
-        cx16.r1 = read4hex()
-
-        sub read4hex() -> uword {
-            str @shared hex = "0000000000000000000000000000000000000000000"
-            cx16.r0++
-            return cx16.r0
-        }
+        gfx_lores.set_screen_mode()
+        gfx_lores.clear_screen(0)
+        gfx_lores.line(0,0,319,239,5)
     }
 }
diff --git a/scripts/profiler.py b/scripts/profiler.py
index 72eb2e7f8..303c3d31d 100755
--- a/scripts/profiler.py
+++ b/scripts/profiler.py
@@ -1,25 +1,22 @@
 #!/usr/bin/env python
 
-"""
+program_description = """
 This is a simple run-time profiler tool for X16 assembly programs.
 It takes an assembly list file (as produced by 64tass/turbo assembler) and
-a memory access statistics dump file (produced by the emulator's -memorystats option)
+a memory access statistics dump file (produced by the X16 emulator's -memorystats option)
 and prints out what assembly lines and variables were read from and written to the most.
 These may indicate hot paths or even bottlenecks in your program,
 and what variables in system ram might be better placed in Zeropage.
-
-The -memorystats option in the emulator is work in progress at the time of writing.
 """
 
 
-import sys
 import argparse
 import operator
 from typing import Tuple
 
 
 class AsmList:
-    """parses a l64tass Turbo Assembler Macro listing file"""
+    """parses a 64tass Turbo Assembler Macro listing file"""
 
     def __init__(self, filename: str) -> None:
         self.lines = []
@@ -170,7 +167,7 @@ def profile(number_of_lines: int, asmlist: str, memstats: str) -> None:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="simple X16 assembly run time profiler")
+    parser = argparse.ArgumentParser(description=program_description)
     parser.add_argument("-n", dest="number", type=int, default=20, help="amount of reads and writes to print (default 20)")
     parser.add_argument("asmlistfile", type=str, help="the 64tass/turbo assembler listing file to read")
     parser.add_argument("memorystatsfile", type=str, help="the X16 emulator memstats dump file to read")