mirror of
https://github.com/irmen/prog8.git
synced 2024-12-24 01:29:28 +00:00
2x faster word multiplication routine
This commit is contained in:
parent
4c8beefdcb
commit
bc8126eb16
@ -428,12 +428,12 @@ internal class ProgramAndVarsGen(
|
||||
asmgen.out("""
|
||||
lda #<${name}_init_value
|
||||
ldy #>${name}_init_value
|
||||
sta cx16.r0L
|
||||
sty cx16.r0H
|
||||
sta cx16.r0
|
||||
sty cx16.r0+1
|
||||
lda #<${name}
|
||||
ldy #>${name}
|
||||
sta cx16.r1L
|
||||
sty cx16.r1H
|
||||
sta cx16.r1
|
||||
sty cx16.r1+1
|
||||
lda #<$size
|
||||
ldy #>$size
|
||||
jsr sys.memcopy""")
|
||||
|
@ -544,7 +544,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
|
||||
return true
|
||||
}
|
||||
in WordDatatypes -> {
|
||||
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
|
||||
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
|
||||
asmgen.out(" jsr math.multiply_words")
|
||||
assignRegisterpairWord(target, RegisterOrPair.AY)
|
||||
return true
|
||||
@ -568,7 +568,7 @@ internal class AssignmentAsmGen(private val program: PtProgram,
|
||||
asmgen.out(" jsr math.mul_word_${value}")
|
||||
}
|
||||
else {
|
||||
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "P8ZP_SCRATCH_W1")
|
||||
asmgen.assignWordOperandsToAYAndVar(expr.right, expr.left, "cx16.r0")
|
||||
asmgen.out(" jsr math.multiply_words")
|
||||
}
|
||||
assignRegisterpairWord(target, RegisterOrPair.AY)
|
||||
|
@ -1178,9 +1178,9 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
|
||||
} else {
|
||||
asmgen.out("""
|
||||
lda $name
|
||||
sta P8ZP_SCRATCH_W1
|
||||
sta cx16.r0
|
||||
lda $name+1
|
||||
sta P8ZP_SCRATCH_W1+1
|
||||
sta cx16.r0+1
|
||||
lda #<$value
|
||||
ldy #>$value
|
||||
jsr math.multiply_words
|
||||
@ -1629,15 +1629,15 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
|
||||
}
|
||||
"*" -> {
|
||||
if(valueDt==DataType.UBYTE) {
|
||||
asmgen.out(" lda $otherName | sta P8ZP_SCRATCH_W1")
|
||||
asmgen.out(" lda $otherName | sta cx16.r0")
|
||||
if(asmgen.isTargetCpu(CpuType.CPU65c02))
|
||||
asmgen.out(" stz P8ZP_SCRATCH_W1+1")
|
||||
asmgen.out(" stz cx16.r0+1")
|
||||
else
|
||||
asmgen.out(" lda #0 | sta P8ZP_SCRATCH_W1+1")
|
||||
asmgen.out(" lda #0 | sta cx16.r0+1")
|
||||
} else {
|
||||
asmgen.out(" lda $otherName")
|
||||
asmgen.signExtendAYlsb(valueDt)
|
||||
asmgen.out(" sta P8ZP_SCRATCH_W1 | sty P8ZP_SCRATCH_W1+1")
|
||||
asmgen.out(" sta cx16.r0 | sty cx16.r0+1")
|
||||
}
|
||||
asmgen.out("""
|
||||
lda $name
|
||||
@ -1773,16 +1773,31 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
|
||||
"+" -> asmgen.out(" lda $name | clc | adc $otherName | sta $name | lda $name+1 | adc $otherName+1 | sta $name+1")
|
||||
"-" -> asmgen.out(" lda $name | sec | sbc $otherName | sta $name | lda $name+1 | sbc $otherName+1 | sta $name+1")
|
||||
"*" -> {
|
||||
asmgen.out("""
|
||||
lda $otherName
|
||||
ldy $otherName+1
|
||||
sta P8ZP_SCRATCH_W1
|
||||
sty P8ZP_SCRATCH_W1+1
|
||||
lda $name
|
||||
ldy $name+1
|
||||
jsr math.multiply_words
|
||||
sta $name
|
||||
sty $name+1""")
|
||||
if(otherName=="cx16.r0")
|
||||
asmgen.out("""
|
||||
lda $name
|
||||
ldy $name+1
|
||||
jsr math.multiply_words
|
||||
sta $name
|
||||
sty $name+1""")
|
||||
else if(name=="cx16.r0")
|
||||
asmgen.out("""
|
||||
lda $otherName
|
||||
ldy $otherName+1
|
||||
jsr math.multiply_words
|
||||
sta $name
|
||||
sty $name+1""")
|
||||
else
|
||||
asmgen.out("""
|
||||
lda $otherName
|
||||
ldy $otherName+1
|
||||
sta cx16.r0
|
||||
sty cx16.r0+1
|
||||
lda $name
|
||||
ldy $name+1
|
||||
jsr math.multiply_words
|
||||
sta $name
|
||||
sty $name+1""")
|
||||
}
|
||||
"/" -> {
|
||||
if(dt==DataType.WORD) {
|
||||
@ -1963,8 +1978,8 @@ internal class AugmentableAssignmentAsmGen(private val program: PtProgram,
|
||||
private fun inplacemodificationWordWithValue(name: String, dt: DataType, operator: String, value: PtExpression) {
|
||||
fun multiplyVarByWordInAY() {
|
||||
asmgen.out("""
|
||||
sta P8ZP_SCRATCH_W1
|
||||
sty P8ZP_SCRATCH_W1+1
|
||||
sta cx16.r0
|
||||
sty cx16.r0+1
|
||||
lda $name
|
||||
ldy $name+1
|
||||
jsr math.multiply_words
|
||||
|
@ -56,37 +56,124 @@ _multiplier = P8ZP_SCRATCH_REG
|
||||
|
||||
multiply_words .proc
|
||||
; -- multiply two 16-bit words into a 32-bit result (signed and unsigned)
|
||||
; input: A/Y = first 16-bit number, P8ZP_SCRATCH_W1 in ZP = second 16-bit number
|
||||
; output: multiply_words.result 4-bytes/32-bits product, LSB order (low-to-high) low 16 bits also in AY.
|
||||
; input: A/Y = first 16-bit number, cx16.R0 = second 16-bit number
|
||||
; output: multiply_words.result == cx16.R0:R1, the 4-byte/32-bit product in LSB order (low-to-high); the low 16 bits are also returned in AY.
|
||||
|
||||
sta P8ZP_SCRATCH_W2
|
||||
sty P8ZP_SCRATCH_W2+1
|
||||
; mult62.a
|
||||
; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958
|
||||
; - adjusted to use fixed zero page addresses
|
||||
; - removed 'decrement to avoid clc' as this is slower on average
|
||||
; - rearranged memory use to remove final memory copy and give LSB first order to result
|
||||
; - removed temp zp storage bytes
|
||||
; - unrolled the outer loop
|
||||
; - unrolled the two inner loops once
|
||||
;
|
||||
; 16 bit x 16 bit unsigned multiply, 32 bit result
|
||||
; Average cycles:
|
||||
; 93 bytes
|
||||
|
||||
mult16 lda #0
|
||||
sta result+2 ; clear upper bits of product
|
||||
sta result+3
|
||||
ldx #16 ; for all 16 bits...
|
||||
- lsr P8ZP_SCRATCH_W1+1 ; divide multiplier by 2
|
||||
ror P8ZP_SCRATCH_W1
|
||||
bcc +
|
||||
lda result+2 ; get upper half of product and add multiplicand
|
||||
clc
|
||||
adc P8ZP_SCRATCH_W2
|
||||
sta result+2
|
||||
lda result+3
|
||||
adc P8ZP_SCRATCH_W2+1
|
||||
+ ror a ; rotate partial product
|
||||
sta result+3
|
||||
ror result+2
|
||||
ror result+1
|
||||
ror result
|
||||
dex
|
||||
bne -
|
||||
lda result
|
||||
ldy result+1
|
||||
rts
|
||||
_multiplicand = P8ZP_SCRATCH_W1 ; 2 bytes
|
||||
_multiplier = cx16.r0 ; 2 bytes
|
||||
result = cx16.r0 ; 4 bytes (note: shares memory with multiplier) so is r0 and ALSO r1.
|
||||
|
||||
result .byte 0,0,0,0
|
||||
; 16 bit x 16 bit unsigned multiply, 32 bit result
|
||||
;
|
||||
; On Entry:
|
||||
; (multiplier, multiplier+1): two byte multiplier, four bytes needed for result
|
||||
; (multiplicand, multiplicand+1): two byte multiplicand
|
||||
; On Exit:
|
||||
; (result, result+1, result+2, result+3): product
|
||||
|
||||
sta _multiplicand
|
||||
sty _multiplicand+1
|
||||
|
||||
lda #0 ;
|
||||
sta result+2 ; 16 bits of zero in A, result+2
|
||||
; Note: First 8 shifts are A -> result+2 -> result
|
||||
; Final 8 shifts are A -> result+2 -> result+1
|
||||
|
||||
; --- 1st byte ---
|
||||
ldy #4 ; count for inner loop
|
||||
lsr result
|
||||
|
||||
; inner loop (4 iterations, body unrolled twice = 8 shift/add steps)
|
||||
_inner_loop
|
||||
; first time
|
||||
bcc +
|
||||
tax ; retain A
|
||||
lda result+2
|
||||
clc
|
||||
adc _multiplicand
|
||||
sta result+2
|
||||
txa ; recall A
|
||||
adc _multiplicand+1
|
||||
|
||||
+
|
||||
ror a ; shift
|
||||
ror result+2
|
||||
ror result
|
||||
|
||||
; second time
|
||||
bcc +
|
||||
tax ; retain A
|
||||
lda result+2
|
||||
clc
|
||||
adc _multiplicand
|
||||
sta result+2
|
||||
txa ; recall A
|
||||
adc _multiplicand+1
|
||||
|
||||
+
|
||||
ror a ; shift
|
||||
ror result+2
|
||||
ror result
|
||||
|
||||
dey
|
||||
bne _inner_loop ; go back for 1 more shift?
|
||||
|
||||
; --- 2nd byte ---
|
||||
ldy #4 ; count for inner loop
|
||||
lsr result+1
|
||||
|
||||
; inner loop (4 iterations, body unrolled twice = 8 shift/add steps)
|
||||
_inner_loop2
|
||||
; first time
|
||||
bcc +
|
||||
tax ; retain A
|
||||
lda result+2
|
||||
clc
|
||||
adc _multiplicand
|
||||
sta result+2
|
||||
txa ; recall A
|
||||
adc _multiplicand+1
|
||||
|
||||
+
|
||||
ror a ; shift
|
||||
ror result+2
|
||||
ror result+1
|
||||
|
||||
; second time
|
||||
bcc +
|
||||
tax ; retain A
|
||||
lda result+2
|
||||
clc
|
||||
adc _multiplicand
|
||||
sta result+2
|
||||
txa ; recall A
|
||||
adc _multiplicand+1
|
||||
|
||||
+
|
||||
ror a ; shift
|
||||
ror result+2
|
||||
ror result+1
|
||||
dey
|
||||
bne _inner_loop2 ; go back for 1 more shift?
|
||||
|
||||
sta result+3 ; ms byte of hi-word of result
|
||||
|
||||
lda result
|
||||
ldy result+1
|
||||
rts
|
||||
.pend
|
||||
|
||||
|
||||
|
@ -2,8 +2,6 @@ TODO
|
||||
====
|
||||
|
||||
- don't allow txt.print('@') if possible, don't cast up a byte to str
|
||||
- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
|
||||
- is math.square still the fastest after this? (now used for word*word)
|
||||
- [on branch:] investigate McCarthy (short-circuit) evaluation again? This may also reduce code size for things like ``if a>4 or a<2`` ....
|
||||
- IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
|
||||
- IR: reduce the number of CMP/CMPI instructions after instructions that already set the status bits correctly (LOADs? INC? etc), but only once it has been verified that those instructions really set the status bits!
|
||||
|
193
examples/test.p8
193
examples/test.p8
@ -11,199 +11,44 @@ cbm2 {
|
||||
|
||||
main {
|
||||
sub start() {
|
||||
ubyte value
|
||||
byte svalue
|
||||
uword wvalue
|
||||
word swvalue
|
||||
uword wvalue2
|
||||
|
||||
txt.print("byte multiply..")
|
||||
txt.print("word square..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 200 {
|
||||
for value in 0 to 255 {
|
||||
cx16.r0L = value*99
|
||||
for wvalue in 0 to 200 {
|
||||
cx16.r0 = wvalue*wvalue
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("byte multiply new..")
|
||||
txt.print("word square via multiply new..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
wvalue2 = wvalue
|
||||
repeat 200 {
|
||||
for value in 0 to 255 {
|
||||
cx16.r0L = multiply_b(value, 99)
|
||||
for wvalue in 0 to 200 {
|
||||
cx16.r0 = wvalue*wvalue2
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("byte multiply verify..")
|
||||
for value in 0 to 255 {
|
||||
if multiply_b(value,99) != value*99 {
|
||||
txt.print("different!")
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
txt.nl()
|
||||
|
||||
txt.print("sbyte multiply..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 200 {
|
||||
for svalue in -128 to 127 {
|
||||
cx16.r0sL = svalue*99
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("sbyte multiply new..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 200 {
|
||||
for svalue in -128 to 127 {
|
||||
cx16.r0L = multiply_sb(svalue, 99)
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("sbyte multiply verify..")
|
||||
for svalue in -128 to 127 {
|
||||
if multiply_sb(svalue,99) != svalue*99 {
|
||||
txt.print("different!")
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
txt.nl()
|
||||
|
||||
txt.print("word multiply..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 200 {
|
||||
for wvalue in 200 to 400 {
|
||||
cx16.r0 = wvalue*987
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("word multiply new..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 200 {
|
||||
for wvalue in 200 to 400 {
|
||||
cx16.r0 = multiply_w(wvalue, 987)
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("word multiply verify..")
|
||||
for wvalue in 200 to 400 {
|
||||
if multiply_w(value,987) != value*987 {
|
||||
txt.print("different!")
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
txt.nl()
|
||||
|
||||
txt.print("sword multiply..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 100 {
|
||||
for swvalue in -400 to 400 {
|
||||
cx16.r0s = swvalue*987
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("sword multiply new..")
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 100 {
|
||||
for swvalue in -400 to 400 {
|
||||
cx16.r0s = multiply_sw(swvalue, 987)
|
||||
}
|
||||
}
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
|
||||
txt.print("sword multiply verify..")
|
||||
for swvalue in -400 to 400 {
|
||||
if multiply_sw(swvalue,987) != swvalue*987 {
|
||||
txt.print("different!")
|
||||
txt.print("word square verify..")
|
||||
for wvalue in 0 to 200 {
|
||||
wvalue2 = wvalue
|
||||
if wvalue*wvalue != wvalue*wvalue2 {
|
||||
txt.print("different! ")
|
||||
txt.print_uw(wvalue)
|
||||
txt.spc()
|
||||
txt.spc()
|
||||
txt.print_uw(wvalue*wvalue)
|
||||
txt.spc()
|
||||
txt.print_uw(wvalue*wvalue2)
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
txt.nl()
|
||||
}
|
||||
|
||||
asmsub multiply_sb(byte value @A, byte multiplicant @Y) -> ubyte @A {
|
||||
%asm {{
|
||||
jmp p8_multiply_b
|
||||
}}
|
||||
}
|
||||
|
||||
asmsub multiply_sw(word value @AY, word multiplicant @R0) -> word @AY {
|
||||
%asm {{
|
||||
jmp p8_multiply_w
|
||||
}}
|
||||
}
|
||||
|
||||
|
||||
asmsub multiply_b(ubyte value @A, ubyte multiplicant @Y) -> ubyte @A {
|
||||
%asm {{
|
||||
|
||||
; ***************************************************************************************
|
||||
; On Entry:
|
||||
; A: multiplier
|
||||
; Y: multiplicand
|
||||
; On Exit:
|
||||
; A: low byte of product
|
||||
; Y: (optional) high byte of product
|
||||
_multiplicand = P8ZP_SCRATCH_B1
|
||||
_multiplier = P8ZP_SCRATCH_REG
|
||||
|
||||
sty _multiplicand
|
||||
lsr a
|
||||
sta _multiplier
|
||||
lda #0
|
||||
ldx #2
|
||||
-
|
||||
bcc +
|
||||
clc
|
||||
adc _multiplicand
|
||||
+
|
||||
ror a
|
||||
ror _multiplier
|
||||
bcc +
|
||||
clc
|
||||
adc _multiplicand
|
||||
+
|
||||
ror a
|
||||
ror _multiplier
|
||||
|
||||
bcc +
|
||||
clc
|
||||
adc _multiplicand
|
||||
+
|
||||
ror a
|
||||
ror _multiplier
|
||||
bcc +
|
||||
clc
|
||||
adc _multiplicand
|
||||
+
|
||||
ror a
|
||||
ror _multiplier
|
||||
dex
|
||||
bne -
|
||||
; tay ; if you want 16 bits result in AY, enable this again
|
||||
lda _multiplier
|
||||
rts
|
||||
}}
|
||||
}
|
||||
|
||||
asmsub multiply_w(uword value @AY, uword multiplicant @R0) -> uword @AY {
|
||||
%asm {{
|
||||
; TODO
|
||||
lda #99
|
||||
ldy #1
|
||||
rts
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user