From e63921009cd3c7fa92b70d707190bc6ab517fec7 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Sat, 4 Oct 2025 21:40:12 +0200
Subject: [PATCH] added math.mul32(), verafx.muls now returns long

---
 .../cpu6502/assignment/AssignmentAsmGen.kt    |  17 ++-
 .../assignment/AugmentableAssignmentAsmGen.kt |  18 ++--
 .../codegen/intermediate/ExpressionGen.kt     |   1 +
 compiler/res/prog8lib/cx16/verafx.p8          |  39 ++++---
 compiler/res/prog8lib/math.p8                 |   6 ++
 compiler/res/prog8lib/virtual/math.p8         |   6 ++
 docs/source/libraries.rst                     |   5 +-
 docs/source/todo.rst                          |   4 +-
 examples/test.p8                              | 101 ++++--------------
 9 files changed, 84 insertions(+), 113 deletions(-)

diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
index d2365a506..0cc318840 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
@@ -1122,7 +1122,7 @@ internal class AssignmentAsmGen(
                             asmgen.out("  pla")
                             asmgen.out("  sta  cx16.r0 |  sty  cx16.r0+1")
                         }
-                        asmgen.out("  jsr  verafx.muls")
+                        asmgen.out("  jsr  verafx.muls16")
                         assignRegisterpairWord(target, RegisterOrPair.AY)
                         return true
                     } else {
@@ -1187,7 +1187,7 @@ internal class AssignmentAsmGen(
                             asmgen.out("""
                                 sta  cx16.r0
                                 sty  cx16.r0+1
-                                jsr  verafx.muls""")
+                                jsr  verafx.muls16""")
                         } else {
                             asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "prog8_math.multiply_words.multiplier")
                             asmgen.out("  jsr  prog8_math.multiply_words")
@@ -2488,6 +2488,9 @@ $endLabel""")
                     assignExpressionToRegister(value, RegisterOrPair.A, valueDt.isSigned)
                     assignTypeCastedRegisters(target.asmVarname, targetDt.base, RegisterOrPair.A, valueDt.base)
                 }
+                valueDt.isLong -> {
+                    TODO("assign typecasted long to $targetDt ${value.position}")
+                }
                 valueDt.isWord || valueDt.isPointer -> {
                     assignExpressionToRegister(value, RegisterOrPair.AY, valueDt.isSigned)
                     assignTypeCastedRegisters(target.asmVarname, targetDt.base, RegisterOrPair.AY, valueDt.base)
@@ -2983,7 +2986,15 @@ $endLabel""")
                             else -> throw AssemblyError("non-word regs")
                         }
                     }
-                    BaseDataType.LONG -> TODO("assign typecasted to LONG")
+                    BaseDataType.LONG -> {
+                        when(regs) {
+                            RegisterOrPair.AX -> asmgen.out("  sta  $targetAsmVarName |  stx  $targetAsmVarName+1")
+                            RegisterOrPair.AY -> asmgen.out("  sta  $targetAsmVarName |  sty  $targetAsmVarName+1")
+                            RegisterOrPair.XY -> asmgen.out("  stx  $targetAsmVarName |  sty  $targetAsmVarName+1")
+                            else -> throw AssemblyError("non-word regs")
+                        }
+                        asmgen.signExtendLongVariable(targetAsmVarName, BaseDataType.WORD)
+                    }
                     BaseDataType.FLOAT -> {
                         if(regs!=RegisterOrPair.AY)
                             throw AssemblyError("only supports AY here")
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
index 14c9a4ebc..fa6c4bec4 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
@@ -2323,7 +2323,7 @@ $shortcutLabel:""")
                 if(value in asmgen.optimizedWordMultiplications) {
                     asmgen.out("  lda  $lsb |  ldy  $msb |  jsr  prog8_math.mul_word_$value |  sta  $lsb |  sty  $msb")
                 } else {
-                    if(block?.options?.veraFxMuls==true)
+                    if(block?.options?.veraFxMuls==true) {
                         // cx16 verafx hardware mul
                         asmgen.out("""
                             lda  $lsb
@@ -2334,9 +2334,10 @@ $shortcutLabel:""")
                             ldy  #>$value
                             sta  cx16.r1
                             sty  cx16.r1+1
-                            jsr  verafx.muls
+                            jsr  verafx.muls16
                             sta  $lsb
                             sty  $msb""")
+                    }
                     else
                         asmgen.out("""
                             lda  $lsb
@@ -2821,9 +2822,10 @@ $shortcutLabel:""")
                                 ldy  $name+1
                                 sta  cx16.r0
                                 sty  cx16.r0+1
-                                jsr  verafx.muls
+                                jsr  verafx.muls16
                                 sta  $name
                                 sty  $name+1""")
+
                         } else {
                             if(valueDt.isUnsignedByte) {
                                 asmgen.out("  lda  $otherName |  sta  prog8_math.multiply_words.multiplier")
@@ -2966,7 +2968,7 @@ $shortcutLabel:""")
                     "+" -> asmgen.out("  lda  $name |  clc |  adc  $otherName |  sta  $name |  lda  $name+1 |  adc  $otherName+1 |  sta  $name+1")
                     "-" -> asmgen.out("  lda  $name |  sec |  sbc  $otherName |  sta  $name |  lda  $name+1 |  sbc  $otherName+1 |  sta  $name+1")
                     "*" -> {
-                        if(block?.options?.veraFxMuls==true)
+                        if(block?.options?.veraFxMuls==true) {
                             // cx16 verafx hardware muls
                             asmgen.out("""
                                 lda  $name
@@ -2977,9 +2979,10 @@ $shortcutLabel:""")
                                 ldy  $otherName+1
                                 sta  cx16.r1
                                 sty  cx16.r1+1
-                                jsr  verafx.muls
+                                jsr  verafx.muls16
                                 sta  $name
                                 sty  $name+1""")
+                        }
                         else
                             asmgen.out("""
                                 lda  $otherName
@@ -3170,7 +3173,7 @@ $shortcutLabel:""")
     private fun inplacemodificationWordWithValue(name: String, dt: DataType, operator: String, value: PtExpression, block: PtBlock?) {
         require(dt.isWord)
         fun multiplyVarByWordInAX() {
-            if(block?.options?.veraFxMuls==true)
+            if(block?.options?.veraFxMuls==true) {
                 // cx16 verafx hardware muls
                 asmgen.out("""
                     sta  cx16.r1
@@ -3179,9 +3182,10 @@ $shortcutLabel:""")
                     ldx  $name+1
                     sta  cx16.r0
                     stx  cx16.r0+1
-                    jsr  verafx.muls
+                    jsr  verafx.muls16
                     sta  $name
                     sty  $name+1""")
+            }
             else
                 asmgen.out("""
                     sta  prog8_math.multiply_words.multiplier
diff --git a/codeGenIntermediate/src/prog8/codegen/intermediate/ExpressionGen.kt b/codeGenIntermediate/src/prog8/codegen/intermediate/ExpressionGen.kt
index b54924e64..a7ea013a5 100644
--- a/codeGenIntermediate/src/prog8/codegen/intermediate/ExpressionGen.kt
+++ b/codeGenIntermediate/src/prog8/codegen/intermediate/ExpressionGen.kt
@@ -686,6 +686,7 @@ internal class ExpressionGen(private val codeGen: IRCodeGen) {
                         addInstr(result, IRInstruction(Opcode.CMPI, IRDataType.WORD, reg1=tr.resultReg, immediate = 0), null)
                         actualResultReg2 = loadStatusAsBooleanResult(Opcode.BSTNE, result)
                     }
+                    valueDt.isLong -> TODO("typecast long ${cast.position}")
                     valueDt.isFloat -> {
                         actualResultReg2 = codeGen.registers.next(IRDataType.BYTE)
                         result += IRCodeChunk(null, null).also {
diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8
index 4ce8b2408..286a3114d 100644
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -116,26 +116,29 @@ verafx {
 
 
     asmsub mult16(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY {
-        ; Returns the 16 bits unsigned result of R0*R1 in AY.
+        ; Returns the lower 16 bits unsigned result of R0*R1 in AY. Also overwrites R0 and R1 (the 32 bits product is left there).
         ; Note: only the lower 16 bits!   (the upper 16 bits are not valid for unsigned word multiplications, only for signed)
         ; Verafx doesn't support unsigned values like this for full 32 bit result.
         ; Note: clobbers VRAM $1f9bc - $1f9bf (inclusive)
         %asm {{
-            lda  cx16.r0
-            sta  P8ZP_SCRATCH_W1
-            lda  cx16.r0+1
-            sta  P8ZP_SCRATCH_W1+1
-            jsr  verafx.muls
-            ldx  P8ZP_SCRATCH_W1
-            stx  cx16.r0
-            ldx  P8ZP_SCRATCH_W1+1
-            stx  cx16.r0+1
+            jmp  muls16               ; tail call: muls16 hands back the lower word in AY and returns to our caller
+        }}
+    }
+
+    asmsub muls16(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY {
+        ; Returns just the lower 16 bits of the signed multiplication result R0*R1, in AY.
+        ; Note: clobbers R0, R1 (they receive the full 32 bits product from muls), and VRAM $1f9bc - $1f9bf (inclusive)
+        %asm {{
+            jsr  muls                 ; full product is stored in R0:R1 (lower word, upper word)
+            lda  cx16.r0L             ; fetch the lower word from R0 into AY
+            ldy  cx16.r0H
+            rts
+        }}
+    }
 
-    asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
-        ; Returns the 32 bits signed result in AY and R0  (lower word, upper word).
+
+    asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> long @R0R1_32 {
+        ; Returns the 32 bits signed result in R0:R1  (lower word, upper word).
         ; Vera Fx multiplication support only works on signed values!
         ; Note: clobbers VRAM $1f9bc - $1f9bf (inclusive)
         %asm {{
@@ -171,12 +174,14 @@ verafx {
             stz  cx16.VERA_DATA0      ; multiply and write out result
             lda  #%00010001           ; $01 with Increment 1
             sta  cx16.VERA_ADDR_H     ; so we can read out the result
-            lda  cx16.VERA_DATA0      ; store the lower 16 bits of the result in AY
+            lda  cx16.VERA_DATA0      ; store the lower 16 bits of the result in R0
             ldy  cx16.VERA_DATA0
-            ldx  cx16.VERA_DATA0      ; store the upper 16 bits of the result in R0
-            stx  cx16.r0s
-            ldx  cx16.VERA_DATA0
-            stx  cx16.r0s+1
+            sta  cx16.r0L
+            sty  cx16.r0H
+            lda  cx16.VERA_DATA0      ; store the upper 16 bits of the result in R1
+            ldy  cx16.VERA_DATA0      ; (second byte of the upper word, goes into r1H)
+            sta  cx16.r1L
+            sty  cx16.r1H
             stz  cx16.VERA_FX_CTRL    ; Cache write disable
             stz  cx16.VERA_FX_MULT    ; $9F2C  reset multiply bit
             stz  cx16.VERA_CTRL       ; reset DCSEL
diff --git a/compiler/res/prog8lib/math.p8 b/compiler/res/prog8lib/math.p8
index c1d36e873..f5fce6a84 100644
--- a/compiler/res/prog8lib/math.p8
+++ b/compiler/res/prog8lib/math.p8
@@ -220,6 +220,12 @@ _sinecosR8	.char  trunc(127.0 * sin(range(180+45) * rad(360.0/180.0)))
         }}
     }
 
+    sub mul32(uword a, uword b) -> long {
+        ; Returns the 32 bits result of a*b as a long. Overwrites cx16.r2. NOTE(review): long looks to be signed, so very large products may read back negative - confirm.
+        cx16.r2 = a*b       ; keep the 16 bits product; done first because the call below presumably reports on this multiplication - verify
+        return mklong2(mul16_last_upper(), cx16.r2)     ; combine mul16_last_upper() and the stored word into one long
+    }
+
 sub direction_sc(byte x1, byte y1, byte x2, byte y2) -> ubyte {
     ; From a pair of signed coordinates around the origin, calculate discrete direction between 0 and 23 into A.
     cx16.r0L = 3        ; quadrant
diff --git a/compiler/res/prog8lib/virtual/math.p8 b/compiler/res/prog8lib/virtual/math.p8
index 260ace7f2..993e17793 100644
--- a/compiler/res/prog8lib/virtual/math.p8
+++ b/compiler/res/prog8lib/virtual/math.p8
@@ -304,6 +304,12 @@ math {
         }}
     }
 
+    sub mul32(uword a, uword b) -> long {
+        ; Returns the 32 bits result of a*b as a long. Overwrites cx16.r2. NOTE(review): relies on mul16_last_upper() being available in this (virtual) math library - confirm.
+        cx16.r2 = a*b       ; keep the 16 bits product; done first because the call below presumably reports on this multiplication - verify
+        return mklong2(mul16_last_upper(), cx16.r2)     ; combine mul16_last_upper() and the stored word into one long
+    }
+
     sub diff(ubyte b1, ubyte b2) -> ubyte {
         if b1>b2
             return b1-b2
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 987509fa8..2b6601a85 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -1239,7 +1239,10 @@ Available for the Cx16 target. Routines that use the Vera FX logic to accelerate
     But it depends on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
     Note: there is a block level %option "verafxmuls" that automatically replaces all word multiplications in that block
     by calls to verafx, but be careful with it because it may interfere with other Vera operations or IRQs.
-    The full 32 bits result value is returned in two result values: lower word, upper word.
+    The full 32 bits result value is returned as a long.
+
+``muls16``
+    Like ``muls`` but only returns the lower word of the result (in AY), which is sometimes useful if you're just interested in word values. Note that it also overwrites R0 and R1.
 
 ``mult16``
     VeraFX hardware multiplication of two unsigned words.
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 5252f6ebf..42599fe5a 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -3,9 +3,7 @@ TODO
 
 LONG TYPE
 ---------
-- scan through more library routines if there are opportunities to use a long param or returnvalue?
-- document the new long type! and mklong(a,b,c,d) and mklong2(w1,w2) , print_l , print_ulhex (& conv.str_l) and pokel, peekl, cbm.SETTIML/RDTIML,  and the use of R0:R1 when doing LONG calculations
-- asmsub call convention: @R0R1_32 to specify a 32 bits long combined register R0:R1
+- document the new long type, including: mklong(a,b,c,d) and mklong2(w1,w2), print_l, print_ulhex (& conv.str_l), pokel, peekl, cbm.SETTIML/RDTIML, math.mul32, verafx.muls/muls16, the use of R0:R1 when doing LONG calculations, and the asmsub call convention @R0R1_32 to specify a 32 bits long combined register R0:R1
 - how hard is it to also implement the other comparison operators (<,>,<=,>=) on longs?
 - implement LONG testcases in testmemory
 
diff --git a/examples/test.p8 b/examples/test.p8
index 0de96095b..d7cc195d6 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,95 +1,32 @@
 %import textio
 %import math
+%import verafx
 %zeropage basicsafe
 
 main {
+    %option verafxmuls
+    ; the verafxmuls block option above replaces the word multiplications in this block by calls to verafx
     sub start() {
-        long @shared lv1 = 12345678
-        long @shared lv2same = 12345678
-        long @shared lv2different = 999999
 
-        if lv1==0
-            txt.print("wrong1\n")
+        cx16.r5s = 22
+        cx16.r6s = -999
 
-        if lv1==0
-            txt.print("wrong2\n")
-        else
-            txt.print("ok2\n")
+        cx16.r0s = cx16.r5s * cx16.r6s      ; 22 * -999 = -21978, still fits in a signed word
+        txt.print_w(cx16.r0s)
+        txt.nl()
 
-        if lv1!=0
-            txt.print("ok3\n")
-
-        if lv1!=0
-            txt.print("ok4\n")
-        else
-            txt.print("wrong4\n")
+        long lv = cx16.r5s * cx16.r6s       ; same product, this time assigned to a long
+        txt.print_l(lv)
+        txt.nl()
 
 
-        if lv1==999999
-            txt.print("wrong5\n")
-
-        if lv1==999999
-            txt.print("wrong6\n")
-        else
-            txt.print("ok6\n")
-
-        if lv1!=999999
-            txt.print("ok7\n")
-
-        if lv1!=999999
-            txt.print("ok8\n")
-        else
-            txt.print("wrong8\n")
-
-        if lv1==12345678
-            txt.print("ok9\n")
-
-        if lv1==12345678
-            txt.print("ok10\n")
-        else
-            txt.print("wrong10\n")
-
-        if lv1!=12345678
-            txt.print("wrong11\n")
-
-        if lv1!=12345678
-            txt.print("wrong12\n")
-        else
-            txt.print("ok12\n")
-
-
-
-        if lv1==lv2same
-            txt.print("ok13\n")
-
-        if lv1==lv2same
-            txt.print("ok14\n")
-        else
-            txt.print("wrong14\n")
-
-        if lv1!=lv2same
-            txt.print("wrong15\n")
-
-        if lv1!=lv2same
-            txt.print("wrong16\n")
-        else
-            txt.print("ok16\n")
-
-
-        if lv1==lv2different
-            txt.print("wrong17\n")
-
-        if lv1==lv2different
-            txt.print("wrong18\n")
-        else
-            txt.print("ok18\n")
-
-        if lv1!=lv2different
-            txt.print("ok19\n")
-
-        if lv1!=lv2different
-            txt.print("ok20\n")
-        else
-            txt.print("wrong20\n")
+        cx16.r5s = 5555
+        cx16.r6s = -9999
+        lv = cx16.r5s * cx16.r6s            ; 5555 * -9999 = -55544445 does not fit in a word; NOTE(review): confirm which value is expected here
+        txt.print_l(lv)
+        txt.nl()
+        lv = verafx.muls(cx16.r5s, cx16.r6s)    ; explicit verafx call that returns the 32 bits signed product as a long
+        txt.print_l(lv)
+        txt.nl()
     }
 }