From c78b7b1a24fe06e237570fdb92215626a620e0e4 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Sat, 28 Sep 2024 01:00:28 +0200
Subject: [PATCH] added verafx.mult16()

---
 compiler/res/prog8lib/cx16/verafx.p8 | 19 ++++++++++++++++++-
 docs/source/libraries.rst            | 21 ++++++++++++++-------
 examples/test.p8                     | 22 +++++++++++++---------
 3 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
index 00eb7cccb..16836a735 100644
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -111,7 +111,7 @@ verafx {
 
     ; unsigned multiplication just passes the values as signed to muls
     ; if you do this yourself in your call to muls, it will save a few instructions.
-    ; TODO fix this: verafx.muls doesn't support unsigned values like this
+    ; TODO fix this: verafx.muls doesn't support unsigned values like this for full 32 bit result
 ;    inline asmsub mult(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY, uword @R0 {
 ;        ; Returns the 32 bits unsigned result in AY and R0  (lower word, upper word).
 ;        %asm {{
@@ -119,6 +119,23 @@ verafx {
 ;        }}
 ;    }
 
+    asmsub mult16(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY {
+        ; Unsigned 16*16 multiplication via verafx.muls: returns the lower 16 bits of R0*R1 in AY, and puts the original value1 back in cx16.r0 afterwards.
+        ; Note: only the lower 16 bits are returned! (the upper 16 bits that verafx.muls produces are not valid for unsigned word multiplications, only for signed)
+        %asm {{
+            lda  cx16.r0                ; save value1 (R0), because verafx.muls returns its upper result word in R0
+            sta  P8ZP_SCRATCH_W1
+            lda  cx16.r0+1
+            sta  P8ZP_SCRATCH_W1+1      ; NOTE(review): assumes verafx.muls leaves P8ZP_SCRATCH_W1 untouched - confirm
+            jsr  verafx.muls            ; lower result word comes back in AY, upper result word in cx16.r0
+            ldx  P8ZP_SCRATCH_W1        ; restore through X (declared clobbered) so the result in A and Y is kept
+            stx  cx16.r0                ; overwrite the upper result word with the saved value1
+            ldx  P8ZP_SCRATCH_W1+1
+            stx  cx16.r0+1
+            rts                         ; AY = lower 16 bits of the product
+        }}
+    }
+
     asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
         ; Returns the 32 bits signed result in AY and R0  (lower word, upper word).
         ; Vera Fx multiplication support only works on signed values!
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 6af8a9545..60ce8773b 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -982,15 +982,22 @@ the emulators already support it).
 ``available``
     Returns true if Vera FX is available, false if not (that would be an older Vera chip)
 
-``mult`` , ``muls``
-    The hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
-    They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication.
-    But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
+``muls``
+    The VeraFX hardware multiplier for signed words (16*16 bits giving a 32 bits result) is accessible via the ``muls`` routine.
+    It is about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
+    But it depends on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
     Note: there is a block level %option "verafxmuls" that automatically replaces all word multiplications in that block
-    by calls to verafx.muls/mult, but be careful with it because it may interfere with other Vera operations or IRQs.
+    by calls to verafx, but be careful with it because it may interfere with other Vera operations or IRQs.
+    The full 32 bits result is returned as two values: the lower word as the normal return value (AY), and the upper word in cx16.r0.
 
-    Note: the lower 16 bits of the 32 bits result is returned as the normal subroutine's returnvalue,
-    but the upper 16 bits is returned in cx16.r0 so you can still access those separately.
+``mult16``
+    VeraFX hardware multiplication of two unsigned words.
+    NOTE: it only returns the lower 16 bits of the full 32 bits result, because the upper 16 bits are not valid for unsigned word multiplications here
+    (the signed word multiplier ``muls`` does return the full 32 bits result).
+    It is about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
+    But it depends on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
+    Note: there is a block level %option "verafxmuls" that automatically replaces all word multiplications in that block
+    by calls to verafx, but be careful with it because it may interfere with other Vera operations or IRQs.
 
 ``clear``
     Very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
diff --git a/examples/test.p8 b/examples/test.p8
index 3bc1d1d0a..a5c44bbfe 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,16 +1,20 @@
 %import textio
-%zeropage dontuse
-
-%output raw
-%launcher none
-%address $2000
+%import verafx
+%zeropage basicsafe
+%option no_sysinit
 
 main {
-    uword @shared variable
+
+    word @shared w1 = -30       ; w1 and w2 are not referenced in start() below
+    word @shared w2 = -40
+    uword @shared uw1 = 9999    ; operands for the unsigned multiplication comparison
+    uword @shared uw2 = 4
+
     sub start() {
-        txt.print("hello!\n")
-        txt.print_uw(variable)
+        cx16.r0 = 12345         ; NOTE(review): R0 is also the register mult16 takes value1 in, so this value is presumably replaced by the call below - confirm intent
+        txt.print_uw(verafx.mult16(uw1, uw2))   ; product computed by the verafx.mult16 routine
+        txt.spc()
+        txt.print_uw(uw1 * uw2)                 ; same product via the regular * operator, printed next to it for comparison
         txt.nl()
-        sys.exit3(1,2,3,false)
     }
 }