dos33fsprogs/hellmood_memories/multiply_tables.s
2020-05-12 14:58:40 -04:00

91 lines
2.2 KiB
ArmAsm

; Fast multiply -- setup tables
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; of the 32 bit result. So we disable generation of the high and low byte
; to save some cycles.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply.
; This routine, at an expense of 2kB of lookup tables, takes around 250.
; If you reuse a term the next time this drops closer to 200.
; This routine was described by Stephen Judd and found
; in The Fridge and in the C=Hacking magazine
; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
; The key thing to note is that
;          (a+b)^2     (a-b)^2
;   a*b =  -------  -  -------
;             4           4
; So if you have tables of the squares of 0..511 you can lookup and subtract
; instead of multiplying.
; Table generation: I:0..511
; square1_lo = <((I*I)/4)
; square1_hi = >((I*I)/4)
; square2_lo = <(((I-255)*(I-255))/4)
; square2_hi = >(((I-255)*(I-255))/4)
; Default homes for the four lookup tables: 512 bytes each, placed back
; to back starting at square1_lo.  The guard only tests square1_lo, so if
; that symbol is already defined when this file is assembled none of the
; defaults below are applied and all four names must be provided elsewhere.
.ifndef square1_lo
square1_lo = $2000                  ; low  bytes of (i*i)/4
square1_hi = square1_lo + $200      ; high bytes of (i*i)/4
square2_lo = square1_lo + $400      ; low  bytes of ((i-255)*(i-255))/4
square2_hi = square1_lo + $600      ; high bytes of ((i-255)*(i-255))/4
.endif
; for(i=0;i<512;i++) {
; square1_lo[i]=((i*i)/4)&0xff;
; square1_hi[i]=(((i*i)/4)>>8)&0xff;
; square2_lo[i]=( ((i-255)*(i-255))/4)&0xff;
; square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
; }
;-----------------------------------------------------------------------
; init_multiply_tables
;   Fills square1_lo/square1_hi with the low/high bytes of (i*i)/4 for
;   i = 0..511, then derives square2_lo/square2_hi from them by mirroring
;   around entry 255.
;
;   In:       nothing -- A, X and the carry are set up here, and Y is
;             written (TAY) before it is ever read.
;   Out:      all four 512-byte tables written.
;   Clobbers: A, X, Y, flags (X = 0 and Y = $ff at the RTS).
;
;   Self-modifying: the immediate operand of ml9 and the high address
;   bytes of the stores at ml0/ml1 are patched while the loop runs, so
;   this code has to execute from writable memory.  The ml0/ml1 high bytes
;   are each incremented twice and never restored, so calling the routine
;   a second time would store two pages higher instead of rebuilding the
;   tables.
;   NOTE(review): ADC is used without a CLD; presumably the caller runs
;   with decimal mode off -- confirm.
;-----------------------------------------------------------------------
init_multiply_tables:
; Build the add tables (square1_lo / square1_hi).
; Loop state: X = low 8 bits of i, Y = high byte of the current square,
; operand of ml9 = low byte of the current square.
ldx #$00
txa
.byte $c9 ; opcode of CMP #imm: on entry it swallows the TYA below as its operand, so Y is not read and comparing A=0 leaves C clear
lb1: tya
adc #$00 ; A = high byte + carry out of the low-byte add (0 + 0 on entry)
ml1: sta square1_hi,x ; square1_hi[i] = high byte (high address byte patched per page)
tay ; keep the high byte in Y for the next pass
cmp #$40 ; C = 1 once the high byte has reached $40, i.e. i >= 256: acts as bit 8 of i
txa ; A = low 8 bits of i
ror ; A = i/2 (C shifted in as the top bit), C = bit 0 of i
ml9: adc #$00 ; A = previous low byte + i/2 + C = low byte of the next square; C = carry for the high byte
sta ml9+1 ; patch the operand above so it holds the running low byte
inx ; step to i+1; Z is set when X wraps to 0
ml0: sta square1_lo,x ; square1_lo[i+1] = low byte (STA leaves Z and C alone); on wrap this hits entry 0 of the page just finished
bne lb1 ; still inside the page: go add C into the high byte
inc ml0+2 ; X wrapped: move the square1_lo store to the next page
inc ml1+2 ; ... and the square1_hi store as well
clc ; the carry is applied with the INY below instead of by the ADC at lb1
iny ; bump the high byte; Z reports Y wrapping past $ff
bne lb1 ; Y != 0: do the next page; Y == 0: both pages are done
; Build the subtract tables based on the existing one.
; For k = 0..255:  square2[255-k] = square1[k]   (stored through Y, counting down)
;                  square2[256+k] = square1[k+1] (stored through X, counting up)
ldx #$00
ldy #$ff
second_table:
lda square1_hi+1,x ; upper half, high byte
sta square2_hi+$100,x
lda square1_hi,x ; lower half (mirrored), high byte
sta square2_hi,y
lda square1_lo+1,x ; upper half, low byte
sta square2_lo+$100,x
lda square1_lo,x ; lower half (mirrored), low byte
sta square2_lo,y
dey
inx
bne second_table ; 256 passes, until X wraps back to 0
rts