diff --git a/compiler/res/prog8lib/cx16/syslib.p8 b/compiler/res/prog8lib/cx16/syslib.p8
index a7d078ed9..3d2c4bea4 100644
--- a/compiler/res/prog8lib/cx16/syslib.p8
+++ b/compiler/res/prog8lib/cx16/syslib.p8
@@ -267,6 +267,17 @@ cx16 {
     &ubyte  VERA_AUDIO_DATA     = VERA_BASE + $001D
     &ubyte  VERA_SPI_DATA       = VERA_BASE + $001E
     &ubyte  VERA_SPI_CTRL       = VERA_BASE + $001F
+
+    ; experimental Vera FX registers: (which register an address maps to depends on the DCSEL value set in VERA_CTRL!!!)
+    &ubyte VERA_FX_CTRL         = VERA_BASE + $0009     ; (DCSEL=2)
+    &ubyte VERA_FX_MULT         = VERA_BASE + $000C     ; (DCSEL=2)
+    &ubyte VERA_FX_CACHE_L      = VERA_BASE + $0009     ; (DCSEL=6)
+    &ubyte VERA_FX_CACHE_M      = VERA_BASE + $000A     ; (DCSEL=6)
+    &ubyte VERA_FX_CACHE_H      = VERA_BASE + $000B     ; (DCSEL=6)
+    &ubyte VERA_FX_CACHE_U      = VERA_BASE + $000C     ; (DCSEL=6)
+    &ubyte VERA_FX_ACCUM_RESET  = VERA_BASE + $0009     ; (DCSEL=6)
+
+
 ; VERA_PSG_BASE     = $1F9C0
 ; VERA_PALETTE_BASE = $1FA00
 ; VERA_SPRITES_BASE = $1FC00
diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
new file mode 100644
index 000000000..fb4aef2ea
--- /dev/null
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -0,0 +1,67 @@
+; Experimental Vera FX support.
+; Docs:
+; https://github.com/X16Community/x16-docs/blob/master/VERA%20FX%20Reference.md
+; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg/edit
+
+verafx {
+
+    ; Unsigned 16*16 multiplication, returning the lower 16 bits of the product.
+    ; It just passes the values as signed to muls (the lower 16 bits of the product
+    ; are the same for signed and unsigned operands).
+    ; If you do the casts yourself in your call to muls, it will save a few instructions.
+    sub mult(uword value1, uword value2) -> uword {
+        return muls(value1 as word, value2 as word) as uword
+    }
+
+    ; Signed 16*16 multiplication using the Vera FX hardware multiplier.
+    ; Takes value1 in R0 and value2 in R1, returns the lower 16 bits of the product in AY.
+    ; Side effects: overwrites the 4 bytes of VRAM at $1f9bc-$1f9bf, changes the Vera ADDR0
+    ; address registers (ADDR_L/M/H), and leaves VERA_CTRL, FX_CTRL and FX_MULT set to 0.
+    asmsub muls(word value1 @R0, word value2 @R1) -> word @AY {
+        %asm {{
+            lda  #(2 << 1)
+            sta  cx16.VERA_CTRL        ; $9F25  (DCSEL=2)
+            stz  cx16.VERA_FX_CTRL     ; $9F29 (mainly to reset Addr1 Mode to 0)
+            lda  #%00010000
+            sta  cx16.VERA_FX_MULT     ; $9F2C  (multiplier enable)
+            lda  #(6 << 1)
+            sta  cx16.VERA_CTRL        ; $9F25  (DCSEL=6)
+            lda  cx16.r0
+            ldy  cx16.r0+1
+            sta  cx16.VERA_FX_CACHE_L  ; $9F29
+            sty  cx16.VERA_FX_CACHE_M  ; $9F2A
+            lda  cx16.r1
+            ldy  cx16.r1+1
+            sta  cx16.VERA_FX_CACHE_H  ; $9F2B
+            sty  cx16.VERA_FX_CACHE_U  ; $9F2C
+            lda  cx16.VERA_FX_ACCUM_RESET   ; $9F29 (DCSEL=6)
+
+            ; Set the ADDR0 pointer to $1f9bc and write our multiplication result there
+            ; (these are the 4 bytes just before the PSG registers start)
+            lda  #(2 << 1)
+            sta  cx16.VERA_CTRL
+            lda  #%01000000           ; Cache Write Enable
+            sta  cx16.VERA_FX_CTRL
+            lda  #$bc
+            sta  cx16.VERA_ADDR_L
+            lda  #$f9
+            sta  cx16.VERA_ADDR_M
+            lda  #$01
+            sta  cx16.VERA_ADDR_H     ; no increment
+            stz  cx16.VERA_DATA0      ; multiply and write out result
+            lda  #%00010001           ; $01 with Increment 1
+            sta  cx16.VERA_ADDR_H     ; so we can read out the result
+            stz  cx16.VERA_FX_CTRL    ; Cache write disable
+            stz  cx16.VERA_FX_MULT    ; multiplier off again (still DCSEL=2 here)
+            stz  cx16.VERA_CTRL       ; back to DCSEL=0, so $9F29-$9F2C are the normal DC registers again
+            lda  cx16.VERA_DATA0
+            ldy  cx16.VERA_DATA0
+            rts
+; we skip the upper 16 bits of the result:
+;            lda  cx16.VERA_DATA0
+;            sta  $0402
+;            lda  cx16.VERA_DATA0
+;            sta  $0403
+        }}
+    }
+}
diff --git a/compiler/res/prog8lib/math.asm b/compiler/res/prog8lib/math.asm
index 2b63d1fd8..8f9920d6d 100644
--- a/compiler/res/prog8lib/math.asm
+++ b/compiler/res/prog8lib/math.asm
@@ -64,6 +64,7 @@ multiply_words	.proc
 	;       but there currently is no way to use 4 consecutive bytes in ZP (without disabling irq and saving/restoring them)...
 
 ; mult62.a
+; from: https://github.com/TobyLobster/multiply_test/blob/main/tests/mult62.a
 ; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958
 ; - adjusted to use fixed zero page addresses
 ; - removed 'decrement to avoid clc' as this is slower on average
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 8d9ba98ce..cfe8bd2e5 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -485,3 +485,17 @@ See the examples/cx16/sprites/dragon.p8 and dragons.p8 programs for ideas how to
 
 Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/sprites.p8>`_
 to see what's in there.
+
+
+verafx  (cx16 only)
+-------------------
+Available for the Cx16 target.
+Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
+the emulators already support it).
+
+For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines.
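+They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
+But they depend on the Vera FX hardware being available, and they overwrite 4 bytes of VRAM at $1f9bc (just before the PSG registers) and change the Vera address and control registers.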
+
+Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
+to see what's in there.
diff --git a/examples/test.p8 b/examples/test.p8
index 041bfa835..52c8a97bc 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,26 +1,39 @@
 %import textio
-%import floats
+%import verafx
+%zeropage basicsafe
+%option no_sysinit
 
 main {
     sub start() {
-        ubyte from = 10
-        ubyte compare=9
-        if from==compare
-            goto equal
 
-        txt.print("from is not compare\n")
-equal:
+        const word MULTIPLIER = 431     ; fixed second operand for every multiplication below
 
-        ubyte end = 15
-        ubyte xx
-        for xx in from to end {
-            txt.print_ub(xx)
-            txt.spc()
+        ; verify results: compare verafx.muls against the regular multiplication, exit with status 1 on the first mismatch
+        for value in -50 to 50 {    ; NOTE(review): 'value' is declared further below; presumably fine because of subroutine-wide scope - confirm
+            if value*MULTIPLIER != verafx.muls(value, MULTIPLIER) {
+                txt.print("verafx muls error\n")
+                sys.exit(1)
+            }
         }
+
+        ; timing runs: 101 values * 250 repeats per routine; each run resets the timer with cbm.SETTIM(0,0,0) and prints cbm.RDTIM16() afterwards
+        word value
+        txt.print("verafx muls...")
+        cbm.SETTIM(0,0,0)
+        for value in -50 to 50 {
+            repeat 250 void verafx.muls(value, MULTIPLIER)
+        }
+        txt.print_uw(cbm.RDTIM16())
+        txt.nl()
+
+        txt.print("6502 muls...")
+        cbm.SETTIM(0,0,0)
+        for value in -50 to 50 {
+            repeat 250 cx16.r0s = value*MULTIPLIER      ; result stored in r0s so the multiplication is actually performed
+        }
+        txt.print_uw(cbm.RDTIM16())
         txt.nl()
 
-        ubyte ten=9
-        if from!=ten
-            txt.print("from is not 10\n")
     }
 }
+