From c78b7b1a24fe06e237570fdb92215626a620e0e4 Mon Sep 17 00:00:00 2001
From: Irmen de Jong
Date: Sat, 28 Sep 2024 01:00:28 +0200
Subject: [PATCH] added verafx.mult16()

---
 compiler/res/prog8lib/cx16/verafx.p8 | 19 ++++++++++++++++++-
 docs/source/libraries.rst            | 21 ++++++++++++++-------
 examples/test.p8                     | 22 +++++++++++++---------
 3 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
index 00eb7cccb..16836a735 100644
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -111,7 +111,7 @@ verafx {
 
     ; unsigned multiplication just passes the values as signed to muls
     ; if you do this yourself in your call to muls, it will save a few instructions.
-    ; TODO fix this: verafx.muls doesn't support unsigned values like this
+    ; TODO fix this: verafx.muls doesn't support unsigned values like this for full 32 bit result
     ; inline asmsub mult(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY, uword @R0 {
     ; ; Returns the 32 bits unsigned result in AY and R0 (lower word, upper word).
     ; %asm {{
@@ -119,6 +119,23 @@ verafx {
     ; }}
     ; }
 
+    asmsub mult16(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY {
+        ; Returns the 16 bits unsigned result of R0*R1 in AY.
+        ; Note: only the lower 16 bits! (the upper 16 bits are not valid for unsigned word multiplications, only for signed)
+        %asm {{
+            lda cx16.r0
+            sta P8ZP_SCRATCH_W1
+            lda cx16.r0+1
+            sta P8ZP_SCRATCH_W1+1
+            jsr verafx.muls
+            ldx P8ZP_SCRATCH_W1
+            stx cx16.r0
+            ldx P8ZP_SCRATCH_W1+1
+            stx cx16.r0+1
+            rts
+        }}
+    }
+
     asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
         ; Returns the 32 bits signed result in AY and R0 (lower word, upper word).
         ; Vera Fx multiplication support only works on signed values!
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 6af8a9545..60ce8773b 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -982,15 +982,22 @@ the emulators already support it).
 ``available``
     Returns true if Vera FX is available, false if not (that would be an older Vera chip)
 
-``mult`` , ``muls``
-    The hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
-    They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication.
-    But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
+``muls``
+    The VeraFX signed word 16*16 to 32 multiplier is accessible via the ``muls`` routine.
+    It is about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
+    But it depends on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
     Note: there is a block level %option "verafxmuls" that automatically replaces all word multiplications in that block
-    by calls to verafx.muls/mult, but be careful with it because it may interfere with other Vera operations or IRQs.
+    by calls to verafx, but be careful with it because it may interfere with other Vera operations or IRQs.
+    The full 32 bits result value is returned in two result values: lower word, upper word.
 
-    Note: the lower 16 bits of the 32 bits result is returned as the normal subroutine's returnvalue,
-    but the upper 16 bits is returned in cx16.r0 so you can still access those separately.
+``mult16``
+    VeraFX hardware multiplication of two unsigned words.
+    NOTE: it only returns the lower 16 bits of the full 32 bits result, because the upper 16 bits are not valid for unsigned word multiplications here
+    (the signed word multiplier ``muls`` does return the full 32 bits result).
+    It is about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
+    But it depends on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
+    Note: there is a block level %option "verafxmuls" that automatically replaces all word multiplications in that block
+    by calls to verafx, but be careful with it because it may interfere with other Vera operations or IRQs.
 
 ``clear``
     Very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
diff --git a/examples/test.p8 b/examples/test.p8
index 3bc1d1d0a..a5c44bbfe 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,16 +1,20 @@
 %import textio
-%zeropage dontuse
-
-%output raw
-%launcher none
-%address $2000
+%import verafx
+%zeropage basicsafe
+%option no_sysinit
 
 main {
-    uword @shared variable
+
+    word @shared w1 = -30
+    word @shared w2 = -40
+    uword @shared uw1 = 9999
+    uword @shared uw2 = 4
+
     sub start() {
-        txt.print("hello!\n")
-        txt.print_uw(variable)
+        cx16.r0 = 12345
+        txt.print_uw(verafx.mult16(uw1, uw2))
+        txt.spc()
+        txt.print_uw(uw1 * uw2)
         txt.nl()
-        sys.exit3(1,2,3,false)
     }
 }