From bc8126eb16d98b219f43abf827fefae51fff880f Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Mon, 14 Aug 2023 17:34:28 +0200
Subject: [PATCH] 2x faster word multiplication routine

---
 .../codegen/cpu6502/ProgramAndVarsGen.kt      |   8 +-
 .../cpu6502/assignment/AssignmentAsmGen.kt    |   4 +-
 .../assignment/AugmentableAssignmentAsmGen.kt |  51 +++--
 compiler/res/prog8lib/math.asm                | 143 ++++++++++---
 docs/source/todo.rst                          |   2 -
 examples/test.p8                              | 193 ++----------------
 6 files changed, 173 insertions(+), 228 deletions(-)

diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt
index 0a4e68f35..ba73f7b2a 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt
@@ -428,12 +428,12 @@ internal class ProgramAndVarsGen(
                 asmgen.out("""
                     lda  #<${name}_init_value
                     ldy  #>${name}_init_value
-                    sta  cx16.r0L
-                    sty  cx16.r0H
+                    sta  cx16.r0
+                    sty  cx16.r0+1
                     lda  #<${name}
                     ldy  #>${name}
-                    sta  cx16.r1L
-                    sty  cx16.r1H
+                    sta  cx16.r1
+                    sty  cx16.r1+1
                     lda  #<$size
                     ldy  #>$size
                     jsr  sys.memcopy""")
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
index 2e1db62da..6c339da06 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt
@@ -544,7 +544,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                     return true
                 }
                 in WordDatatypes -> {
-                    asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
+                    asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
                     asmgen.out("  jsr  math.multiply_words")
                     assignRegisterpairWord(target, RegisterOrPair.AY)
                     return true
@@ -568,7 +568,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
                         asmgen.out("  jsr  math.mul_word_${value}")
                     }
                     else {
-                        asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
+                        asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
                         asmgen.out("  jsr  math.multiply_words")
                     }
                     assignRegisterpairWord(target, RegisterOrPair.AY)
diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
index 074f6c2ab..d6515a181 100644
--- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
+++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt
@@ -1178,9 +1178,9 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
                 } else {
                     asmgen.out("""
                         lda  $name
-                        sta  P8ZP_SCRATCH_W1
+                        sta  cx16.r0
                         lda  $name+1
-                        sta  P8ZP_SCRATCH_W1+1
+                        sta  cx16.r0+1
                         lda  #<$value
                         ldy  #>$value
                         jsr  math.multiply_words
@@ -1629,15 +1629,15 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
                     }
                     "*" -> {
                         if(valueDt==DataType.UBYTE) {
-                            asmgen.out("  lda  $otherName |  sta  P8ZP_SCRATCH_W1")
+                            asmgen.out("  lda  $otherName |  sta  cx16.r0")
                             if(asmgen.isTargetCpu(CpuType.CPU65c02))
-                                asmgen.out("  stz  P8ZP_SCRATCH_W1+1")
+                                asmgen.out("  stz  cx16.r0+1")
                             else
-                                asmgen.out("  lda  #0 |  sta  P8ZP_SCRATCH_W1+1")
+                                asmgen.out("  lda  #0 |  sta  cx16.r0+1")
                         } else {
                             asmgen.out("  lda  $otherName")
                             asmgen.signExtendAYlsb(valueDt)
-                            asmgen.out("  sta  P8ZP_SCRATCH_W1 |  sty  P8ZP_SCRATCH_W1+1")
+                            asmgen.out("  sta  cx16.r0 |  sty  cx16.r0+1")
                         }
                         asmgen.out("""
                                 lda  $name
@@ -1773,16 +1773,31 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
                     "+" -> asmgen.out("  lda  $name |  clc |  adc  $otherName |  sta  $name |  lda  $name+1 |  adc  $otherName+1 |  sta  $name+1")
                     "-" -> asmgen.out("  lda  $name |  sec |  sbc  $otherName |  sta  $name |  lda  $name+1 |  sbc  $otherName+1 |  sta  $name+1")
                     "*" -> {
-                        asmgen.out("""
-                            lda  $otherName
-                            ldy  $otherName+1
-                            sta  P8ZP_SCRATCH_W1
-                            sty  P8ZP_SCRATCH_W1+1
-                            lda  $name
-                            ldy  $name+1
-                            jsr  math.multiply_words
-                            sta  $name
-                            sty  $name+1""")
+                        if(otherName=="cx16.r0")
+                            asmgen.out("""
+                                lda  $name
+                                ldy  $name+1
+                                jsr  math.multiply_words
+                                sta  $name
+                                sty  $name+1""")
+                        else if(name=="cx16.r0")
+                            asmgen.out("""
+                                lda  $otherName
+                                ldy  $otherName+1
+                                jsr  math.multiply_words
+                                sta  $name
+                                sty  $name+1""")
+                        else
+                            asmgen.out("""
+                                lda  $otherName
+                                ldy  $otherName+1
+                                sta  cx16.r0
+                                sty  cx16.r0+1
+                                lda  $name
+                                ldy  $name+1
+                                jsr  math.multiply_words
+                                sta  $name
+                                sty  $name+1""")
                     }
                     "/" -> {
                         if(dt==DataType.WORD) {
@@ -1963,8 +1978,8 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
     private fun inplacemodificationWordWithValue(name: String, dt: DataType, operator: String, value: PtExpression) {
         fun multiplyVarByWordInAY() {
             asmgen.out("""
-                sta  P8ZP_SCRATCH_W1
-                sty  P8ZP_SCRATCH_W1+1
+                sta  cx16.r0
+                sty  cx16.r0+1
                 lda  $name
                 ldy  $name+1
                 jsr  math.multiply_words
diff --git a/compiler/res/prog8lib/math.asm b/compiler/res/prog8lib/math.asm
index 149156de5..2b17be1d9 100644
--- a/compiler/res/prog8lib/math.asm
+++ b/compiler/res/prog8lib/math.asm
@@ -56,37 +56,124 @@ _multiplier      = P8ZP_SCRATCH_REG
 
 multiply_words	.proc
 	; -- multiply two 16-bit words into a 32-bit result  (signed and unsigned)
-	;      input: A/Y = first 16-bit number, P8ZP_SCRATCH_W1 in ZP = second 16-bit number
-	;      output: multiply_words.result  4-bytes/32-bits product, LSB order (low-to-high)  low 16 bits also in AY.
+	;      input: A/Y = first 16-bit number, cx16.R0 = second 16-bit number
+	;      output: multiply_words.result == cx16.R0:R1, 4-bytes/32-bits product, LSB order (low-to-high)  low 16 bits also in AY.
 
-		sta  P8ZP_SCRATCH_W2
-		sty  P8ZP_SCRATCH_W2+1
+; mult62.a
+; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958
+; - adjusted to use fixed zero page addresses
+; - removed 'decrement to avoid clc' as this is slower on average
+; - rearranged memory use to remove final memory copy and give LSB first order to result
+; - removed temp zp storage bytes
+; - unrolled the outer loop
+; - unrolled the two inner loops once
+;
+; 16 bit x 16 bit unsigned multiply, 32 bit result
+; Average cycles:
+; 93 bytes
 
-mult16		lda  #0
-		sta  result+2	; clear upper bits of product
-		sta  result+3
-		ldx  #16			; for all 16 bits...
--	 	lsr  P8ZP_SCRATCH_W1+1	; divide multiplier by 2
-		ror  P8ZP_SCRATCH_W1
-		bcc  +
-		lda  result+2	; get upper half of product and add multiplicand
-		clc
-		adc  P8ZP_SCRATCH_W2
-		sta  result+2
-		lda  result+3
-		adc  P8ZP_SCRATCH_W2+1
-+ 		ror  a				; rotate partial product
-		sta  result+3
-		ror  result+2
-		ror  result+1
-		ror  result
-		dex
-		bne  -
-		lda  result
-		ldy  result+1
-		rts
+_multiplicand    = P8ZP_SCRATCH_W1   ; 2 bytes
+_multiplier      = cx16.r0   ; 2 bytes
+result           = cx16.r0   ; 4 bytes   (note: shares memory with multiplier)  so is r0 and ALSO r1.
 
-result		.byte  0,0,0,0
+; 16 bit x 16 bit unsigned multiply, 32 bit result
+;
+; On Entry:
+;   (multiplier, multiplier+1): two byte multiplier, four bytes needed for result
+;   (multiplicand, multiplicand+1): two byte multiplicand
+; On Exit:
+;   (result, result+1, result+2, result+3): product
+
+    sta  _multiplicand
+    sty  _multiplicand+1
+
+    lda  #0              ;
+    sta  result+2        ; 16 bits of zero in A, result+2
+                        ;  Note:    First 8 shifts are  A -> result+2 -> result
+                        ;           Final 8 shifts are  A -> result+2 -> result+1
+
+    ; --- 1st byte ---
+    ldy  #4              ; count for inner loop
+    lsr  result
+
+    ; inner loop: 4 passes (Y counts down from 4), body unrolled twice = 8 shift/add steps
+_inner_loop
+    ; first time
+    bcc +
+    tax                 ; retain A
+    lda  result+2
+    clc
+    adc  _multiplicand
+    sta  result+2
+    txa                 ; recall A
+    adc  _multiplicand+1
+
++
+    ror  a                ; shift
+    ror  result+2
+    ror  result
+
+    ; second time
+    bcc +
+    tax                 ; retain A
+    lda  result+2
+    clc
+    adc  _multiplicand
+    sta  result+2
+    txa                 ; recall A
+    adc  _multiplicand+1
+
++
+    ror  a                 ; shift
+    ror  result+2
+    ror  result
+
+    dey
+    bne  _inner_loop      ; go back for 1 more shift?
+
+    ; --- 2nd byte ---
+    ldy  #4              ; count for inner loop
+    lsr  result+1
+
+    ; inner loop: 4 passes (Y counts down from 4), body unrolled twice = 8 shift/add steps
+_inner_loop2
+    ; first time
+    bcc  +
+    tax                 ; retain A
+    lda  result+2
+    clc
+    adc  _multiplicand
+    sta  result+2
+    txa                 ; recall A
+    adc  _multiplicand+1
+
++
+    ror  a                ; shift
+    ror  result+2
+    ror  result+1
+
+    ; second time
+    bcc  +
+    tax                 ; retain A
+    lda  result+2
+    clc
+    adc  _multiplicand
+    sta  result+2
+    txa                 ; recall A
+    adc  _multiplicand+1
+
++
+    ror  a                ; shift
+    ror  result+2
+    ror  result+1
+    dey
+    bne  _inner_loop2     ; go back for 1 more shift?
+
+    sta  result+3        ; ms byte of hi-word of result
+
+    lda  result
+    ldy  result+1
+    rts
 		.pend
 
 
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 72421a07d..c92a65c0e 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -2,8 +2,6 @@ TODO
 ====
 
 - don't allow txt.print('@')  if possible, don't cast up a byte to str
-- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
-- is math.square still the fastest after this? (now used for word*word)
 - [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
 - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!
diff --git a/examples/test.p8 b/examples/test.p8
index 4e554b5f0..80a70f051 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -11,199 +11,44 @@ cbm2 {
 
 main {
     sub start() {
-        ubyte value
-        byte svalue
         uword wvalue
-        word swvalue
+        uword wvalue2
 
-        txt.print("byte multiply..")
+        txt.print("word square..")
         cbm.SETTIM(0,0,0)
         repeat 200 {
-            for value in 0 to 255 {
-                cx16.r0L = value*99
+            for wvalue in 0 to 200 {
+                cx16.r0 = wvalue*wvalue
             }
         }
         txt.print_uw(cbm.RDTIM16())
         txt.nl()
 
-        txt.print("byte multiply new..")
+        txt.print("word square via multiply new..")
         cbm.SETTIM(0,0,0)
+        wvalue2 = wvalue
         repeat 200 {
-            for value in 0 to 255 {
-                cx16.r0L = multiply_b(value, 99)
+            for wvalue in 0 to 200 {
+                cx16.r0 = wvalue*wvalue2
             }
         }
         txt.print_uw(cbm.RDTIM16())
         txt.nl()
 
-        txt.print("byte multiply verify..")
-        for value in 0 to 255 {
-            if multiply_b(value,99) != value*99 {
-                txt.print("different!")
-                sys.exit(1)
-            }
-        }
-        txt.nl()
-
-        txt.print("sbyte multiply..")
-        cbm.SETTIM(0,0,0)
-        repeat 200 {
-            for svalue in -128 to 127 {
-                cx16.r0sL = svalue*99
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("sbyte multiply new..")
-        cbm.SETTIM(0,0,0)
-        repeat 200 {
-            for svalue in -128 to 127 {
-                cx16.r0L = multiply_sb(svalue, 99)
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("sbyte multiply verify..")
-        for svalue in -128 to 127 {
-            if multiply_sb(svalue,99) != svalue*99 {
-                txt.print("different!")
-                sys.exit(1)
-            }
-        }
-        txt.nl()
-
-        txt.print("word multiply..")
-        cbm.SETTIM(0,0,0)
-        repeat 200 {
-            for wvalue in 200 to 400 {
-                cx16.r0 = wvalue*987
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("word multiply new..")
-        cbm.SETTIM(0,0,0)
-        repeat 200 {
-            for wvalue in 200 to 400 {
-                cx16.r0 = multiply_w(wvalue, 987)
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("word multiply verify..")
-        for wvalue in 200 to 400 {
-            if multiply_w(value,987) != value*987 {
-                txt.print("different!")
-                sys.exit(1)
-            }
-        }
-        txt.nl()
-
-        txt.print("sword multiply..")
-        cbm.SETTIM(0,0,0)
-        repeat 100 {
-            for swvalue in -400 to 400 {
-                cx16.r0s = swvalue*987
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("sword multiply new..")
-        cbm.SETTIM(0,0,0)
-        repeat 100 {
-            for swvalue in -400 to 400 {
-                cx16.r0s = multiply_sw(swvalue, 987)
-            }
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-
-        txt.print("sword multiply verify..")
-        for swvalue in -400 to 400 {
-            if multiply_sw(swvalue,987) != swvalue*987 {
-                txt.print("different!")
+        txt.print("word square verify..")
+        for wvalue in 0 to 200 {
+            wvalue2 = wvalue
+            if wvalue*wvalue != wvalue*wvalue2 {
+                txt.print("different! ")
+                txt.print_uw(wvalue)
+                txt.spc()
+                txt.spc()
+                txt.print_uw(wvalue*wvalue)
+                txt.spc()
+                txt.print_uw(wvalue*wvalue2)
                 sys.exit(1)
             }
         }
         txt.nl()
     }
-
-asmsub multiply_sb(byte value @A, byte multiplicant @Y) -> ubyte @A {
-    %asm {{
-        jmp  p8_multiply_b
-    }}
-}
-
-asmsub multiply_sw(word value @AY, word multiplicant @R0) -> word @AY {
-    %asm {{
-        jmp  p8_multiply_w
-    }}
-}
-
-
-    asmsub multiply_b(ubyte value @A, ubyte multiplicant @Y) -> ubyte @A {
-        %asm {{
-
-; ***************************************************************************************
-; On Entry:
-;   A:   multiplier
-;   Y:   multiplicand
-; On Exit:
-;   A:     low byte of product
-;   Y: (optional) high byte of product
-_multiplicand    = P8ZP_SCRATCH_B1
-_multiplier      = P8ZP_SCRATCH_REG
-
-    sty  _multiplicand
-    lsr  a
-    sta  _multiplier
-    lda  #0
-    ldx  #2
--
-    bcc  +
-    clc
-    adc  _multiplicand
-+
-    ror  a
-    ror  _multiplier
-    bcc  +
-    clc
-    adc  _multiplicand
-+
-    ror  a
-    ror  _multiplier
-
-    bcc  +
-    clc
-    adc  _multiplicand
-+
-    ror  a
-    ror  _multiplier
-    bcc  +
-    clc
-    adc  _multiplicand
-+
-    ror  a
-    ror  _multiplier
-    dex
-    bne  -
-    ; tay       ; if you want 16 bits result in AY, enable this again
-    lda  _multiplier
-    rts
-        }}
-    }
-
-    asmsub multiply_w(uword value @AY, uword multiplicant @R0) -> uword @AY {
-        %asm {{
-            ; TODO
-            lda  #99
-            ldy  #1
-            rts
-        }}
-    }
 }