From bc8126eb16d98b219f43abf827fefae51fff880f Mon Sep 17 00:00:00 2001 From: Irmen de Jong Date: Mon, 14 Aug 2023 17:34:28 +0200 Subject: [PATCH] 2x faster word multiplication routine --- .../codegen/cpu6502/ProgramAndVarsGen.kt | 8 +- .../cpu6502/assignment/AssignmentAsmGen.kt | 4 +- .../assignment/AugmentableAssignmentAsmGen.kt | 51 +++-- compiler/res/prog8lib/math.asm | 143 ++++++++++--- docs/source/todo.rst | 2 - examples/test.p8 | 193 ++---------------- 6 files changed, 173 insertions(+), 228 deletions(-) diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt index 0a4e68f35..ba73f7b2a 100644 --- a/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt +++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/ProgramAndVarsGen.kt @@ -428,12 +428,12 @@ internal class ProgramAndVarsGen( asmgen.out(""" lda #<${name}_init_value ldy #>${name}_init_value - sta cx16.r0L - sty cx16.r0H + sta cx16.r0 + sty cx16.r0+1 lda #<${name} ldy #>${name} - sta cx16.r1L - sty cx16.r1H + sta cx16.r1 + sty cx16.r1+1 lda #<$size ldy #>$size jsr sys.memcopy""") diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt index 2e1db62da..6c339da06 100644 --- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt +++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AssignmentAsmGen.kt @@ -544,7 +544,7 @@ internal class AssignmentAsmGen(private val program: PtProgram, return true } in WordDatatypes -> { - asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1") + asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0") asmgen.out(" jsr math.multiply_words") assignRegisterpairWord(target, RegisterOrPair.AY) return true @@ -568,7 +568,7 @@ internal class AssignmentAsmGen(private val program: PtProgram, asmgen.out(" jsr math.mul_word_${value}") } else { - asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1") + asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0") asmgen.out(" jsr math.multiply_words") } assignRegisterpairWord(target, RegisterOrPair.AY) diff --git a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt index 074f6c2ab..d6515a181 100644 --- a/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt +++ b/codeGenCpu6502/src/prog8/codegen/cpu6502/assignment/AugmentableAssignmentAsmGen.kt @@ -1178,9 +1178,9 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram, } else { asmgen.out(""" lda $name - sta P8ZP_SCRATCH_W1 + sta cx16.r0 lda $name+1 - sta P8ZP_SCRATCH_W1+1 + sta cx16.r0+1 lda #<$value ldy #>$value jsr math.multiply_words @@ -1629,15 +1629,15 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram, } "*" -> { if(valueDt==DataType.UBYTE) { - asmgen.out(" lda $otherName | sta P8ZP_SCRATCH_W1") + asmgen.out(" lda $otherName | sta cx16.r0") if(asmgen.isTargetCpu(CpuType.CPU65c02)) - asmgen.out(" stz P8ZP_SCRATCH_W1+1") + asmgen.out(" stz cx16.r0+1") else - asmgen.out(" lda #0 | sta P8ZP_SCRATCH_W1+1") + asmgen.out(" lda #0 | sta cx16.r0+1") } else { asmgen.out(" lda $otherName") asmgen.signExtendAYlsb(valueDt) - asmgen.out(" sta P8ZP_SCRATCH_W1 | sty P8ZP_SCRATCH_W1+1") + asmgen.out(" sta cx16.r0 | sty cx16.r0+1") } asmgen.out(""" lda $name @@ -1773,16 +1773,31 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram, "+" -> asmgen.out(" lda $name | clc | adc $otherName | sta $name | lda $name+1 | adc $otherName+1 | sta $name+1") "-" -> asmgen.out(" lda $name | sec | sbc $otherName | sta $name | lda $name+1 | sbc $otherName+1 | sta $name+1") "*" -> { - asmgen.out(""" - lda $otherName - ldy $otherName+1 - sta P8ZP_SCRATCH_W1 - sty P8ZP_SCRATCH_W1+1 - lda $name - ldy $name+1 - jsr math.multiply_words - sta $name - sty $name+1""") + if(otherName=="cx16.r0") + asmgen.out(""" + lda $name + ldy $name+1 + jsr math.multiply_words + sta $name + sty $name+1""") + else if(name=="cx16.r0") + asmgen.out(""" + lda $otherName + ldy $otherName+1 + jsr math.multiply_words + sta $name + sty $name+1""") + else + asmgen.out(""" + lda $otherName + ldy $otherName+1 + sta cx16.r0 + sty cx16.r0+1 + lda $name + ldy $name+1 + jsr math.multiply_words + sta $name + sty $name+1""") } "/" -> { if(dt==DataType.WORD) { @@ -1963,8 +1978,8 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram, private fun inplacemodificationWordWithValue(name: String, dt: DataType, operator: String, value: PtExpression) { fun multiplyVarByWordInAY() { asmgen.out(""" - sta P8ZP_SCRATCH_W1 - sty P8ZP_SCRATCH_W1+1 + sta cx16.r0 + sty cx16.r0+1 lda $name ldy $name+1 jsr math.multiply_words diff --git a/compiler/res/prog8lib/math.asm b/compiler/res/prog8lib/math.asm index 149156de5..2b17be1d9 100644 --- a/compiler/res/prog8lib/math.asm +++ b/compiler/res/prog8lib/math.asm @@ -56,37 +56,124 @@ _multiplier = P8ZP_SCRATCH_REG multiply_words .proc ; -- multiply two 16-bit words into a 32-bit result (signed and unsigned) - ; input: A/Y = first 16-bit number, P8ZP_SCRATCH_W1 in ZP = second 16-bit number - ; output: multiply_words.result 4-bytes/32-bits product, LSB order (low-to-high) low 16 bits also in AY. + ; input: A/Y = first 16-bit number, cx16.R0 = second 16-bit number + ; output: multiply_words.result == cx16.R0:R1, 4-bytes/32-bits product, LSB order (low-to-high) low 16 bits also in AY. - sta P8ZP_SCRATCH_W2 - sty P8ZP_SCRATCH_W2+1 +; mult62.a +; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958 +; - adjusted to use fixed zero page addresses +; - removed 'decrement to avoid clc' as this is slower on average +; - rearranged memory use to remove final memory copy and give LSB first order to result +; - removed temp zp storage bytes +; - unrolled the outer loop +; - unrolled the two inner loops once +; +; 16 bit x 16 bit unsigned multiply, 32 bit result +; Average cycles: +; 93 bytes -mult16 lda #0 - sta result+2 ; clear upper bits of product - sta result+3 - ldx #16 ; for all 16 bits... -- lsr P8ZP_SCRATCH_W1+1 ; divide multiplier by 2 - ror P8ZP_SCRATCH_W1 - bcc + - lda result+2 ; get upper half of product and add multiplicand - clc - adc P8ZP_SCRATCH_W2 - sta result+2 - lda result+3 - adc P8ZP_SCRATCH_W2+1 -+ ror a ; rotate partial product - sta result+3 - ror result+2 - ror result+1 - ror result - dex - bne - - lda result - ldy result+1 - rts +_multiplicand = P8ZP_SCRATCH_W1 ; 2 bytes +_multiplier = cx16.r0 ; 2 bytes +result = cx16.r0 ; 4 bytes (note: shares memory with multiplier) so is r0 and ALSO r1. -result .byte 0,0,0,0 +; 16 bit x 16 bit unsigned multiply, 32 bit result +; +; On Entry: +; (multiplier, multiplier+1): two byte multiplier, four bytes needed for result +; (multiplicand, multiplicand+1): two byte multiplicand +; On Exit: +; (result, result+1, result+2, result+3): product + + sta _multiplicand + sty _multiplicand+1 + + lda #0 ; + sta result+2 ; 16 bits of zero in A, result+2 + ; Note: First 8 shifts are A -> result+2 -> result + ; Final 8 shifts are A -> result+2 -> result+1 + + ; --- 1st byte --- + ldy #4 ; count for inner loop + lsr result + + ; inner loop (8 times) +_inner_loop + ; first time + bcc + + tax ; retain A + lda result+2 + clc + adc _multiplicand + sta result+2 + txa ; recall A + adc _multiplicand+1 + ++ + ror a ; shift + ror result+2 + ror result + + ; second time + bcc + + tax ; retain A + lda result+2 + clc + adc _multiplicand + sta result+2 + txa ; recall A + adc _multiplicand+1 + ++ + ror a ; shift + ror result+2 + ror result + + dey + bne _inner_loop ; go back for 1 more shift? + + ; --- 2nd byte --- + ldy #4 ; count for inner loop + lsr result+1 + + ; inner loop (8 times) +_inner_loop2 + ; first time + bcc + + tax ; retain A + lda result+2 + clc + adc _multiplicand + sta result+2 + txa ; recall A + adc _multiplicand+1 + ++ + ror a ; shift + ror result+2 + ror result+1 + + ; second time + bcc + + tax ; retain A + lda result+2 + clc + adc _multiplicand + sta result+2 + txa ; recall A + adc _multiplicand+1 + ++ + ror a ; shift + ror result+2 + ror result+1 + dey + bne _inner_loop2 ; go back for 1 more shift? + + sta result+3 ; ms byte of hi-word of result + + lda result + ldy result+1 + rts .pend diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 72421a07d..c92a65c0e 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -2,8 +2,6 @@ TODO ==== - don't allow txt.print('@') if possible, don't cast up a byte to str -- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test -- is math.square still the fastest after this? (now used for word*word) - [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified! diff --git a/examples/test.p8 b/examples/test.p8 index 4e554b5f0..80a70f051 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -11,199 +11,44 @@ cbm2 { main { sub start() { - ubyte value - byte svalue uword wvalue - word swvalue + uword wvalue2 - txt.print("byte multiply..") + txt.print("word square..") cbm.SETTIM(0,0,0) repeat 200 { - for value in 0 to 255 { - cx16.r0L = value*99 + for wvalue in 0 to 200 { + cx16.r0 = wvalue*wvalue } } txt.print_uw(cbm.RDTIM16()) txt.nl() - txt.print("byte multiply new..") + txt.print("word square via multiply new..") cbm.SETTIM(0,0,0) + wvalue2 = wvalue repeat 200 { - for value in 0 to 255 { - cx16.r0L = multiply_b(value, 99) + for wvalue in 0 to 200 { + cx16.r0 = wvalue*wvalue2 } } txt.print_uw(cbm.RDTIM16()) txt.nl() - txt.print("byte multiply verify..") - for value in 0 to 255 { - if multiply_b(value,99) != value*99 { - txt.print("different!") - sys.exit(1) - } - } - txt.nl() - - txt.print("sbyte multiply..") - cbm.SETTIM(0,0,0) - repeat 200 { - for svalue in -128 to 127 { - cx16.r0sL = svalue*99 - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("sbyte multiply new..") - cbm.SETTIM(0,0,0) - repeat 200 { - for svalue in -128 to 127 { - cx16.r0L = multiply_sb(svalue, 99) - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("sbyte multiply verify..") - for svalue in -128 to 127 { - if multiply_sb(svalue,99) != svalue*99 { - txt.print("different!") - sys.exit(1) - } - } - txt.nl() - - txt.print("word multiply..") - cbm.SETTIM(0,0,0) - repeat 200 { - for wvalue in 200 to 400 { - cx16.r0 = wvalue*987 - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("word multiply new..") - cbm.SETTIM(0,0,0) - repeat 200 { - for wvalue in 200 to 400 { - cx16.r0 = multiply_w(wvalue, 987) - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("word multiply verify..") - for wvalue in 200 to 400 { - if multiply_w(value,987) != value*987 { - txt.print("different!") - sys.exit(1) - } - } - txt.nl() - - txt.print("sword multiply..") - cbm.SETTIM(0,0,0) - repeat 100 { - for swvalue in -400 to 400 { - cx16.r0s = swvalue*987 - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("sword multiply new..") - cbm.SETTIM(0,0,0) - repeat 100 { - for swvalue in -400 to 400 { - cx16.r0s = multiply_sw(swvalue, 987) - } - } - txt.print_uw(cbm.RDTIM16()) - txt.nl() - - txt.print("sword multiply verify..") - for swvalue in -400 to 400 { - if multiply_sw(swvalue,987) != swvalue*987 { - txt.print("different!") + txt.print("word square verify..") + for wvalue in 0 to 200 { + wvalue2 = wvalue + if wvalue*wvalue != wvalue*wvalue2 { + txt.print("different! ") + txt.print_uw(wvalue) + txt.spc() + txt.spc() + txt.print_uw(wvalue*wvalue) + txt.spc() + txt.print_uw(wvalue*wvalue2) sys.exit(1) } } txt.nl() } - -asmsub multiply_sb(byte value @A, byte multiplicant @Y) -> ubyte @A { - %asm {{ - jmp p8_multiply_b - }} -} - -asmsub multiply_sw(word value @AY, word multiplicant @R0) -> word @AY { - %asm {{ - jmp p8_multiply_w - }} -} - - - asmsub multiply_b(ubyte value @A, ubyte multiplicant @Y) -> ubyte @A { - %asm {{ - -; *************************************************************************************** -; On Entry: -; A: multiplier -; Y: multiplicand -; On Exit: -; A: low byte of product -; Y: (optional) high byte of product -_multiplicand = P8ZP_SCRATCH_B1 -_multiplier = P8ZP_SCRATCH_REG - - sty _multiplicand - lsr a - sta _multiplier - lda #0 - ldx #2 -- - bcc + - clc - adc _multiplicand -+ - ror a - ror _multiplier - bcc + - clc - adc _multiplicand -+ - ror a - ror _multiplier - - bcc + - clc - adc _multiplicand -+ - ror a - ror _multiplier - bcc + - clc - adc _multiplicand -+ - ror a - ror _multiplier - dex - bne - - ; tay ; if you want 16 bits result in AY, enable this again - lda _multiplier - rts - }} - } - - asmsub multiply_w(uword value @AY, uword multiplicant @R0) -> uword @AY { - %asm {{ - ; TODO - lda #99 - ldy #1 - rts - }} - } }