diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
index 47636a6b5..00eb7cccb 100644
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -111,15 +111,17 @@ verafx {
 
     ; unsigned multiplication just passes the values as signed to muls
     ; if you do this yourself in your call to muls, it will save a few instructions.
-    inline asmsub mult(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY, uword @R0 {
-        ; Returns the 32 bits unsigned result in AY and R0  (lower word, upper word).
-        %asm {{
-            jsr  verafx.muls
-        }}
-    }
+    ; TODO fix this: verafx.muls doesn't support unsigned values like this
+;    inline asmsub mult(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY, uword @R0 {
+;        ; Returns the 32 bits unsigned result in AY and R0  (lower word, upper word).
+;        %asm {{
+;            jsr  verafx.muls
+;        }}
+;    }
 
     asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
         ; Returns the 32 bits signed result in AY and R0  (lower word, upper word).
+        ; Vera Fx multiplication support only works on signed values!
         %asm {{
             lda  #(2 << 1)
             sta  cx16.VERA_CTRL        ; $9F25
diff --git a/compiler/res/prog8lib/math.asm b/compiler/res/prog8lib/math.asm
index 8ea07cc49..97ade9ea0 100644
--- a/compiler/res/prog8lib/math.asm
+++ b/compiler/res/prog8lib/math.asm
@@ -55,13 +55,14 @@ _multiplier      = P8ZP_SCRATCH_REG
 
 
 multiply_words	.proc
-	; -- multiply two 16-bit words into a 32-bit result  (signed and unsigned)
+	; -- multiply two 16-bit words into a 32-bit result  (UNSIGNED)
 	;      input: A/Y = first 16-bit number, multiply_words.multiplier = second 16-bit number
 	;      output: multiply_words.result, 4-bytes/32-bits product, LSB order (low-to-high)  low 16 bits also in AY.
+	;      you can retrieve the upper 16 bits via math.mul16_last_upper()
 
-	; NOTE: the result (which includes the multiplier parameter on entry) is a 4-byte array.
-	;       this routine could be faster if we could stick that into zeropage,
-	;       but there currently is no way to use 4 consecutive bytes in ZP (without disabling irq and saving/restoring them)...
+	; NOTE FOR NEGATIVE VALUES:
+	;      The routine also works for NEGATIVE (signed) word values, but ONLY the lower 16 bits of the result are correct then!
+	;      Prog8 only uses those so that's not an issue, but math.mul16_last_upper() no longer gives the correct result here.
 
 ; mult62.a
 ; from: https://github.com/TobyLobster/multiply_test/blob/main/tests/mult62.a
@@ -179,7 +180,7 @@ _inner_loop2
     ldy  result+1
     rts
 
-result		.byte  0,0,0,0
+result		.byte  0,0,0,0       ; routine could be faster if this were in Zeropage...
 
 		.pend
 
diff --git a/compiler/res/prog8lib/math.p8 b/compiler/res/prog8lib/math.p8
index b472d8897..5c33bae41 100644
--- a/compiler/res/prog8lib/math.p8
+++ b/compiler/res/prog8lib/math.p8
@@ -168,6 +168,9 @@ _sinecosR8	.char  trunc(127.0 * sin(range(180+45) * rad(360.0/180.0)))
         ;     for instance, simply printing a number may already result in new multiplication calls being performed
         ;   - not all multiplications in the source code result in an actual multiplication call:
         ;     some simpler multiplications will be optimized away into faster routines. These will not set the upper 16 bits at all!
+        ;   - THE RESULT IS ONLY VALID IF THE MULTIPLICATION WAS DONE WITH UWORD ARGUMENTS (or two positive WORD arguments)
+        ;     as soon as one (or both) of the arguments was a negative word value, these upper 16 bits are not valid!!
+        ;     Suggestion (if you are on the Commander X16): use verafx.muls() to get a hardware accelerated 32 bit signed multiplication.
         %asm {{
             lda  multiply_words.result+2
             ldy  multiply_words.result+3
diff --git a/compiler/res/prog8lib/virtual/math.p8 b/compiler/res/prog8lib/virtual/math.p8
index 85aa296be..9f77c749f 100644
--- a/compiler/res/prog8lib/virtual/math.p8
+++ b/compiler/res/prog8lib/virtual/math.p8
@@ -293,6 +293,8 @@ math {
         ;     for instance, simply printing a number may already result in new multiplication calls being performed
         ;   - not all multiplications in the source code result in an actual multiplication call:
         ;     some simpler multiplications will be optimized away into faster routines. These will not set the upper 16 bits at all!
+        ;   - THE RESULT IS ONLY VALID IF THE MULTIPLICATION WAS DONE WITH UWORD ARGUMENTS (or two positive WORD arguments)
+        ;     as soon as one (or both) of the arguments was a negative word value, these upper 16 bits are not valid!!
         %ir {{
             syscall 33 (): r0.w
             returnr.w r0
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index ce4da7d5c..6af8a9545 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -787,6 +787,10 @@ but perhaps the provided ones can be of service too.
     It does not work for the verafx multiplication routines on the Commander X16!
     These have a different way to obtain the upper 16 bits of the result: just read cx16.r0.
 
+    **NOTE:** the result is only valid if the multiplication was done with uword arguments (or two positive word arguments).
+    As soon as one (or both) of the arguments was a negative word value, these upper 16 bits are not valid!
+    Suggestion (if you are on the Commander X16): use ``verafx.muls()`` to get a hardware accelerated 32 bit signed multiplication.
+
 ``crc16 (uword data, uword length) -> uword``
     Returns a CRC-16 (XMODEM) checksum over the given data buffer.
     Note: on the Commander X16, there is a CRC-16 routine in the kernal: cx16.memory_crc().
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index ebce982d8..4de567291 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -14,7 +14,6 @@ Compiler:
 - Can we support signed % (remainder) somehow?
 - Don't add "random" rts to %asm blocks but instead give a warning about it? (but this breaks existing behavior that others already depend on... command line switch? block directive?)
 - IR: implement missing operators in AssignmentGen  (array shifts etc)
-- IR: CMPI+BSTEQ --> new BEQ reg,value,label instruction (like BGT etc)
 - instead of copy-pasting inline asmsubs, make them into a 64tass macro and use that instead.
   that will allow them to be reused from custom user written assembly code as well.
 - Multidimensional arrays and chained indexing, purely as syntactic sugar over regular arrays.