2x faster word multiplication routine

This commit is contained in:
Irmen de Jong 2023-08-14 17:34:28 +02:00
parent 4c8beefdcb
commit bc8126eb16
6 changed files with 173 additions and 228 deletions

View File

@ -428,12 +428,12 @@ internal class ProgramAndVarsGen(
asmgen.out("""
lda #<${name}_init_value
ldy #>${name}_init_value
sta cx16.r0L
sty cx16.r0H
sta cx16.r0
sty cx16.r0+1
lda #<${name}
ldy #>${name}
sta cx16.r1L
sty cx16.r1H
sta cx16.r1
sty cx16.r1+1
lda #<$size
ldy #>$size
jsr sys.memcopy""")

View File

@ -544,7 +544,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
return true
}
in WordDatatypes -> {
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
asmgen.out(" jsr math.multiply_words")
assignRegisterpairWord(target, RegisterOrPair.AY)
return true
@ -568,7 +568,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
asmgen.out(" jsr math.mul_word_${value}")
}
else {
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
asmgen.out(" jsr math.multiply_words")
}
assignRegisterpairWord(target, RegisterOrPair.AY)

View File

@ -1178,9 +1178,9 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
} else {
asmgen.out("""
lda $name
sta P8ZP_SCRATCH_W1
sta cx16.r0
lda $name+1
sta P8ZP_SCRATCH_W1+1
sta cx16.r0+1
lda #<$value
ldy #>$value
jsr math.multiply_words
@ -1629,15 +1629,15 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
}
"*" -> {
if(valueDt==DataType.UBYTE) {
asmgen.out(" lda $otherName | sta P8ZP_SCRATCH_W1")
asmgen.out(" lda $otherName | sta cx16.r0")
if(asmgen.isTargetCpu(CpuType.CPU65c02))
asmgen.out(" stz P8ZP_SCRATCH_W1+1")
asmgen.out(" stz cx16.r0+1")
else
asmgen.out(" lda #0 | sta P8ZP_SCRATCH_W1+1")
asmgen.out(" lda #0 | sta cx16.r0+1")
} else {
asmgen.out(" lda $otherName")
asmgen.signExtendAYlsb(valueDt)
asmgen.out(" sta P8ZP_SCRATCH_W1 | sty P8ZP_SCRATCH_W1+1")
asmgen.out(" sta cx16.r0 | sty cx16.r0+1")
}
asmgen.out("""
lda $name
@ -1773,16 +1773,31 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
"+" -> asmgen.out(" lda $name | clc | adc $otherName | sta $name | lda $name+1 | adc $otherName+1 | sta $name+1")
"-" -> asmgen.out(" lda $name | sec | sbc $otherName | sta $name | lda $name+1 | sbc $otherName+1 | sta $name+1")
"*" -> {
asmgen.out("""
lda $otherName
ldy $otherName+1
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda $name
ldy $name+1
jsr math.multiply_words
sta $name
sty $name+1""")
if(otherName=="cx16.r0")
asmgen.out("""
lda $name
ldy $name+1
jsr math.multiply_words
sta $name
sty $name+1""")
else if(name=="cx16.r0")
asmgen.out("""
lda $otherName
ldy $otherName+1
jsr math.multiply_words
sta $name
sty $name+1""")
else
asmgen.out("""
lda $otherName
ldy $otherName+1
sta cx16.r0
sty cx16.r0+1
lda $name
ldy $name+1
jsr math.multiply_words
sta $name
sty $name+1""")
}
"/" -> {
if(dt==DataType.WORD) {
@ -1963,8 +1978,8 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
private fun inplacemodificationWordWithValue(name: String, dt: DataType, operator: String, value: PtExpression) {
fun multiplyVarByWordInAY() {
asmgen.out("""
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
sta cx16.r0
sty cx16.r0+1
lda $name
ldy $name+1
jsr math.multiply_words

View File

@ -56,37 +56,124 @@ _multiplier = P8ZP_SCRATCH_REG
multiply_words .proc
; -- multiply two 16-bit words into a 32-bit result (signed and unsigned)
; input: A/Y = first 16-bit number, P8ZP_SCRATCH_W1 in ZP = second 16-bit number
; output: multiply_words.result 4-bytes/32-bits product, LSB order (low-to-high) low 16 bits also in AY.
; input: A/Y = first 16-bit number, cx16.R0 = second 16-bit number
; output: multiply_words.result == cx16.R0:R1, a 4-byte/32-bit product stored in LSB-first (low-to-high) order; the low 16 bits are also returned in AY.
sta P8ZP_SCRATCH_W2
sty P8ZP_SCRATCH_W2+1
; mult62.a
; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958
; - adjusted to use fixed zero page addresses
; - removed 'decrement to avoid clc' as this is slower on average
; - rearranged memory use to remove final memory copy and give LSB first order to result
; - removed temp zp storage bytes
; - unrolled the outer loop
; - unrolled the two inner loops once
;
; 16 bit x 16 bit unsigned multiply, 32 bit result
; Average cycles:
; 93 bytes
mult16 lda #0
sta result+2 ; clear upper bits of product
sta result+3
ldx #16 ; for all 16 bits...
- lsr P8ZP_SCRATCH_W1+1 ; divide multiplier by 2
ror P8ZP_SCRATCH_W1
bcc +
lda result+2 ; get upper half of product and add multiplicand
clc
adc P8ZP_SCRATCH_W2
sta result+2
lda result+3
adc P8ZP_SCRATCH_W2+1
+ ror a ; rotate partial product
sta result+3
ror result+2
ror result+1
ror result
dex
bne -
lda result
ldy result+1
rts
_multiplicand = P8ZP_SCRATCH_W1 ; 2 bytes
_multiplier = cx16.r0 ; 2 bytes
result = cx16.r0 ; 4 bytes: occupies r0 AND r1 (note: the low 2 bytes share memory with the multiplier)
result .byte 0,0,0,0
; 16 bit x 16 bit unsigned multiply, 32 bit result
;
; On Entry:
; (multiplier, multiplier+1): two byte multiplier, four bytes needed for result
; (multiplicand, multiplicand+1): two byte multiplicand
; On Exit:
; (result, result+1, result+2, result+3): product
sta _multiplicand
sty _multiplicand+1
lda #0 ;
sta result+2 ; 16 bits of zero in A, result+2
; Note: First 8 shifts are A -> result+2 -> result
; Final 8 shifts are A -> result+2 -> result+1
; --- 1st byte ---
ldy #4 ; count for inner loop
lsr result
; inner loop: 4 passes, each unrolled into two shift/add steps (8 steps in total)
_inner_loop
; first time
bcc +
tax ; retain A
lda result+2
clc
adc _multiplicand
sta result+2
txa ; recall A
adc _multiplicand+1
+
ror a ; shift
ror result+2
ror result
; second time
bcc +
tax ; retain A
lda result+2
clc
adc _multiplicand
sta result+2
txa ; recall A
adc _multiplicand+1
+
ror a ; shift
ror result+2
ror result
dey
bne _inner_loop ; go back for 1 more shift?
; --- 2nd byte ---
ldy #4 ; count for inner loop
lsr result+1
; inner loop: 4 passes, each unrolled into two shift/add steps (8 steps in total)
_inner_loop2
; first time
bcc +
tax ; retain A
lda result+2
clc
adc _multiplicand
sta result+2
txa ; recall A
adc _multiplicand+1
+
ror a ; shift
ror result+2
ror result+1
; second time
bcc +
tax ; retain A
lda result+2
clc
adc _multiplicand
sta result+2
txa ; recall A
adc _multiplicand+1
+
ror a ; shift
ror result+2
ror result+1
dey
bne _inner_loop2 ; go back for 1 more shift?
sta result+3 ; ms byte of hi-word of result
lda result
ldy result+1
rts
.pend

View File

@ -2,8 +2,6 @@ TODO
====
- don't allow txt.print('@') if possible, don't cast up a byte to str
- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
- is math.square still the fastest after this? (now used for word*word)
- [on branch:] investigate McCarthy (short-circuit) evaluation again? This may also reduce code size for things like if a>4 or a<2 ....
- IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
- IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!

View File

@ -11,199 +11,44 @@ cbm2 {
main {
sub start() {
ubyte value
byte svalue
uword wvalue
word swvalue
uword wvalue2
txt.print("byte multiply..")
txt.print("word square..")
cbm.SETTIM(0,0,0)
repeat 200 {
for value in 0 to 255 {
cx16.r0L = value*99
for wvalue in 0 to 200 {
cx16.r0 = wvalue*wvalue
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("byte multiply new..")
txt.print("word square via multiply new..")
cbm.SETTIM(0,0,0)
wvalue2 = wvalue
repeat 200 {
for value in 0 to 255 {
cx16.r0L = multiply_b(value, 99)
for wvalue in 0 to 200 {
cx16.r0 = wvalue*wvalue2
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("byte multiply verify..")
for value in 0 to 255 {
if multiply_b(value,99) != value*99 {
txt.print("different!")
sys.exit(1)
}
}
txt.nl()
txt.print("sbyte multiply..")
cbm.SETTIM(0,0,0)
repeat 200 {
for svalue in -128 to 127 {
cx16.r0sL = svalue*99
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("sbyte multiply new..")
cbm.SETTIM(0,0,0)
repeat 200 {
for svalue in -128 to 127 {
cx16.r0L = multiply_sb(svalue, 99)
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("sbyte multiply verify..")
for svalue in -128 to 127 {
if multiply_sb(svalue,99) != svalue*99 {
txt.print("different!")
sys.exit(1)
}
}
txt.nl()
txt.print("word multiply..")
cbm.SETTIM(0,0,0)
repeat 200 {
for wvalue in 200 to 400 {
cx16.r0 = wvalue*987
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("word multiply new..")
cbm.SETTIM(0,0,0)
repeat 200 {
for wvalue in 200 to 400 {
cx16.r0 = multiply_w(wvalue, 987)
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("word multiply verify..")
for wvalue in 200 to 400 {
if multiply_w(value,987) != value*987 {
txt.print("different!")
sys.exit(1)
}
}
txt.nl()
txt.print("sword multiply..")
cbm.SETTIM(0,0,0)
repeat 100 {
for swvalue in -400 to 400 {
cx16.r0s = swvalue*987
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("sword multiply new..")
cbm.SETTIM(0,0,0)
repeat 100 {
for swvalue in -400 to 400 {
cx16.r0s = multiply_sw(swvalue, 987)
}
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
txt.print("sword multiply verify..")
for swvalue in -400 to 400 {
if multiply_sw(swvalue,987) != swvalue*987 {
txt.print("different!")
txt.print("word square verify..")
for wvalue in 0 to 200 {
wvalue2 = wvalue
if wvalue*wvalue != wvalue*wvalue2 {
txt.print("different! ")
txt.print_uw(wvalue)
txt.spc()
txt.spc()
txt.print_uw(wvalue*wvalue)
txt.spc()
txt.print_uw(wvalue*wvalue2)
sys.exit(1)
}
}
txt.nl()
}
; Signed-byte entry point for the byte multiply benchmark.
; Takes value in A and multiplicant in Y, and declares a ubyte result in A.
; Does no work of its own: it jumps straight to p8_multiply_b, so that routine's rts returns to our caller.
; NOTE(review): p8_multiply_b is presumably the assembler-level label of multiply_b in this block - confirm.
asmsub multiply_sb(byte value @A, byte multiplicant @Y) -> ubyte @A {
%asm {{
; tail jump (jmp, not jsr): A and Y are passed through untouched
jmp p8_multiply_b
}}
}
; Signed-word entry point for the word multiply benchmark.
; Takes value in AY and multiplicant in R0, and declares a word result in AY.
; Does no work of its own: it jumps straight to p8_multiply_w, so that routine's rts returns to our caller.
; NOTE(review): p8_multiply_w is presumably the assembler-level label of multiply_w in this block - confirm.
asmsub multiply_sw(word value @AY, word multiplicant @R0) -> word @AY {
%asm {{
; tail jump (jmp, not jsr): AY and R0 are passed through untouched
jmp p8_multiply_w
}}
}
; Unsigned 8-bit x 8-bit multiply (shift-and-add); returns the low byte of the product in A.
; The loop body is unrolled to handle four multiplier bits per pass and runs twice (X = 2), so 8 bits in total.
; Uses P8ZP_SCRATCH_B1 and P8ZP_SCRATCH_REG as work bytes and overwrites X.
asmsub multiply_b(ubyte value @A, ubyte multiplicant @Y) -> ubyte @A {
%asm {{
; ***************************************************************************************
; On Entry:
; A: multiplier
; Y: multiplicand
; On Exit:
; A: low byte of product
; Y: left unchanged by this routine; it only receives the high byte of the product
;    if the commented-out 'tay' near the end is enabled again
_multiplicand = P8ZP_SCRATCH_B1
_multiplier = P8ZP_SCRATCH_REG
sty _multiplicand
; shift multiplier bit 0 into carry; the remaining bits are kept in _multiplier
lsr a
sta _multiplier
; A now accumulates the high part of the product (lda/ldx leave the carry untouched)
lda #0
ldx #2
-
; Each of the four identical steps below does the following:
;   if the current multiplier bit (in carry) is set, add the multiplicand to A;
;   then rotate the pair A:_multiplier right by one bit. This shifts a product bit into
;   _multiplier and moves the next multiplier bit out into carry.
bcc +
clc
adc _multiplicand
+
ror a
ror _multiplier
bcc +
clc
adc _multiplicand
+
ror a
ror _multiplier
bcc +
clc
adc _multiplicand
+
ror a
ror _multiplier
bcc +
clc
adc _multiplicand
+
ror a
ror _multiplier
; two passes of four steps = all 8 multiplier bits
dex
bne -
; at this point A holds the high byte and _multiplier the low byte of the product
; tay ; if you want 16 bits result in AY, enable this again
lda _multiplier
rts
}}
}
; Placeholder for the 16-bit x 16-bit multiply benchmark routine: the actual multiplication is not written yet.
; Declared to take value in AY and multiplicant in R0 and to return a uword in AY,
; but the body ignores both inputs and always returns A=99, Y=1 (the word 355 when A is the low byte and Y the high byte).
asmsub multiply_w(uword value @AY, uword multiplicant @R0) -> uword @AY {
%asm {{
; TODO
; fixed dummy result so the routine can be called and timed; not a real product
lda #99
ldy #1
rts
}}
}
}