diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
index 274c7408f..a0341183e 100644
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -6,15 +6,45 @@
 verafx {
     %option no_symbol_prefixing
 
-    sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) {
-        ; TODO use vera fx cache write
-        cx16.vaddr(vbank, vaddr, 0, true)
-        repeat numlongs {
-            cx16.VERA_DATA0 = data
-            cx16.VERA_DATA0 = data
-            cx16.VERA_DATA0 = data
-            cx16.VERA_DATA0 = data
+    sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
+        ; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value
+        ; this routine is around 3 times faster as gfx2.clear_screen()
+        cx16.VERA_CTRL = 0
+        cx16.VERA_ADDR_H = vbank | %00110000       ; 4-byte increment
+        cx16.VERA_ADDR_M = msb(vaddr)
+        cx16.VERA_ADDR_L = lsb(vaddr)
+        cx16.VERA_CTRL = 6<<1       ; dcsel = 6, fill the 32 bits cache
+        cx16.VERA_FX_CACHE_L = data
+        cx16.VERA_FX_CACHE_M = data
+        cx16.VERA_FX_CACHE_H = data
+        cx16.VERA_FX_CACHE_U = data
+        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
+        cx16.VERA_FX_MULT = 0
+        cx16.VERA_FX_CTRL = %01000000    ; cache write enable
+
+        if (amountof32bits & %1111110000000011) == 0 {
+            repeat lsb(amountof32bits >> 2)
+                unroll 4 cx16.VERA_DATA0=0       ; write 4 bytes at a time, unrolled
         }
+        else if (amountof32bits & %1111111000000001) == 0 {
+            repeat lsb(amountof32bits >> 1)
+                unroll 2 cx16.VERA_DATA0=0       ; write 4 bytes at a time, unrolled
+        }
+        else if (lsb(amountof32bits) & 3) == 0 {
+            repeat amountof32bits >> 2
+                unroll 4 cx16.VERA_DATA0=0       ; write 4 bytes at a time, unrolled
+        }
+        else if (lsb(amountof32bits) & 1) == 0 {
+            repeat amountof32bits >> 1
+                unroll 2 cx16.VERA_DATA0=0       ; write 4 bytes at a time, unrolled
+        }
+        else {
+            repeat amountof32bits
+                cx16.VERA_DATA0=0       ; write 4 bytes at a time
+        }
+
+        cx16.VERA_FX_CTRL = 0       ; cache write disable
+        cx16.VERA_CTRL = 0
     }
 
     ; unsigned multiplication just passes the values as signed to muls
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index cfe8bd2e5..33da4d4ef 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -493,9 +493,14 @@ Available for the Cx16 target.
 Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
 the emulators already support it).
 
-For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines.
-They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication.
-But they depend on
+``mult`` , ``muls``
+    For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
+    They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication.
+    But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
+
+``clear``
+    There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
+    The routine is around 3 times faster as a regular unrolled loop to clear vram.
 
 Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
 to see what's in there.
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index ff09bc848..62ddcce73 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,8 @@
 TODO
 ====
 
+- '>>='  can be used as an operator in an expression?? should only be augmented assignment!
+
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
 - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!
diff --git a/examples/test.p8 b/examples/test.p8
index 89efde817..9f3d54bce 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -37,8 +37,24 @@ main {
         txt.nl()
 
         gfx2.screen_mode(1)
-        verafx.fill(0, 0, %10101010, 1200)        ; should fill top half of the screen
-        verafx.fill(0, 4800, %11111111, 1200)     ; should fill bottom half of the screen
+
+        cbm.SETTIM(0,0,0)
+        repeat 255 {
+            gfx2.clear_screen()
+        }
+        uword time1 = cbm.RDTIM16()
+
+        cbm.SETTIM(0,0,0)
+        repeat 255 {
+            verafx.clear(0, 0, %10101010, 2400)
+        }
+        uword time2 = cbm.RDTIM16()
+
+        gfx2.screen_mode(0)
+        txt.print_uw(time1)
+        txt.spc()
+        txt.print_uw(time2)
+        txt.nl()
 
 
 ;        txt.print_uw(math.mul16_last_upper())