dos33fsprogs/demos/hellmood_memories/multiply_u16x16_mod.s

222 lines
4.2 KiB
ArmAsm

; MODIFIED
; we only care about the result of RESULT1 for sierpinski
; Fast multiply
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; of the 32 bit result. So we disable generation of the high and low byte
; to save some cycles.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply
; This routine, at an expense of 2kB of lookup tables, takes around 250
; If you reuse a term the next time this drops closer to 200
; Fast 16x16 bit unsigned multiplication, 32-bit result
; Input: NUM1H:NUM1L * NUM2H:NUM2L
; Result: RESULT3:RESULT2:RESULT1:RESULT0
;
; Does self-modifying code to hard-code NUM1H:NUM1L into the code
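; Two entry points select whether NUM1 is reloaded (this modified version
; never tests the carry flag on entry):
;   multiply_u16x16:           patch NUM1H:NUM1L into the code, then multiply
;   multiply_u16x16_same_num1: skip the patching, re-use previous NUM1H:NUM1L
; (the unmodified routine quoted the reload as 58 cycles slower; fewer
;  operands are patched now, so that figure is an upper bound -- re-measure)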
;
; clobbered: RESULT, TEMP, A, C (X is saved in TEMP and restored on exit)
; Allocation setup: T1,T2 and RESULT preferably on Zero-page.
;
; NUM1H (x_i), NUM1L (x_f)
; NUM2H (y_i), NUM2L (y_f)
; NUM1L * NUM2L = AAaa
; NUM1L * NUM2H = BBbb
; NUM1H * NUM2L = CCcc
; NUM1H * NUM2H = DDdd
;
;           AAaa
;         BBbb
;         CCcc
;   +   DDdd
;   ------------
;       R3R2R1R0   (RESULT3:RESULT2:RESULT1:RESULT0)
;fixed_16x16_mul_unsigned:
multiply_u16x16:
	;=====================================================================
	; Full entry point: bake NUM1H:NUM1L into the multiply code below.
	;
	; Each smNx label marks an absolute,X table access; writing to
	; label+1 replaces the low byte of that instruction's address operand.
	; (Presumably the square tables are page-aligned so that patching only
	;  the low byte is sufficient -- confirm against the table definitions.)
	;
	; Only the sites still used by the trimmed-down multiply are patched:
	;   NUM1L  -> sm1a, sm3a, sm5a      ~NUM1L -> sm2a, sm4a, sm6a
	;   NUM1H  -> sm1b                  ~NUM1H -> sm2b
	; The sites for the disabled partial products (sm7a/sm8a, sm3b..sm8b)
	; are intentionally left alone.
	;
	; Execution falls straight through into multiply_u16x16_same_num1.
	;=====================================================================

	; --- low byte of NUM1 --------------------------------------------
	lda	NUM1L
	sta	sm5a+1			; NUM1L*NUM2H, low  (square1 side)
	sta	sm3a+1			; NUM1L*NUM2L, high (square1 side)
	sta	sm1a+1			; NUM1L*NUM2L, low  (square1 side)

	eor	#$ff			; one's complement for the subtracted table
	sta	sm6a+1			; NUM1L*NUM2H, low  (square2 side)
	sta	sm4a+1			; NUM1L*NUM2L, high (square2 side)
	sta	sm2a+1			; NUM1L*NUM2L, low  (square2 side)

	; --- high byte of NUM1 -------------------------------------------
	lda	NUM1H
	sta	sm1b+1			; NUM1H*NUM2L, low  (square1 side)

	eor	#$ff			; one's complement for the subtracted table
	sta	sm2b+1			; NUM1H*NUM2L, low  (square2 side)

	; NOTE(review): the old running total of 52 cycles was counted when
	; more patch sites were enabled; re-measure before relying on it.
multiply_u16x16_same_num1:
	;=====================================================================
	; Multiply using whatever NUM1H:NUM1L was last patched into the
	; smNx operands (enter at multiply_u16x16 to patch a new value first).
	;
	; In:    NUM2H:NUM2L = second factor
	; Out:   RESULT1 = byte 1 of the 32-bit product
	;                  = low byte of (_AA + _bb + _cc)
	;        RESULT0 = byte 0 of the product (_aa)
	;        RESULT2/RESULT3 are NOT produced by this trimmed version.
	; Clobb: A, flags, TEMP.  X is saved in TEMP and restored.
	;
	; Partial products (upper case = high byte, lower case = low byte):
	;        NUM1L*NUM2L = AAaa     NUM1L*NUM2H = BBbb
	;        NUM1H*NUM2L = CCcc     NUM1H*NUM2H = DDdd  (not computed)
	;
	; Each product byte is formed as square1_xx - square2_xx through the
	; patched lda/sbc pairs; the tables themselves are defined elsewhere.
	;=====================================================================
	stx	TEMP			; keep caller's X; X is our table index

	;==========================
	; NUM1L * NUM2L = AAaa
	;==========================
	ldx	NUM2L
	sec				; no borrow going into the low byte
sm1a:
	lda	square1_lo,x
sm2a:
	sbc	square2_lo,x
	sta	RESULT0			; _aa
sm3a:
	lda	square1_hi,x
sm4a:
	sbc	square2_hi,x		; borrow from the low byte carries over
	sta	_AA+1			; _AA -> immediate operand of the sum below

	;==========================
	; NUM1H * NUM2L = CCcc   (only _cc is needed)
	;==========================
	sec
sm1b:
	lda	square1_lo,x		; X is still NUM2L
sm2b:
	sbc	square2_lo,x
	sta	_cc+1			; _cc -> immediate operand of the sum below

	;==========================
	; NUM1L * NUM2H = BBbb   (only _bb is needed)
	;==========================
	ldx	NUM2H
	sec
sm5a:
	lda	square1_lo,x
sm6a:
	sbc	square2_lo,x
	sta	_bb+1			; _bb -> immediate operand of the sum below

	; NUM1H * NUM2H = DDdd only feeds RESULT2/RESULT3, so it is skipped,
	; as are the high bytes _BB and _CC.

	;===========================================
	; RESULT1 = (_AA + _bb + _cc) mod 256
	;===========================================
	; The #0 operands are placeholders overwritten by the stores above.
	; Carries out of byte 1 belong to RESULT2, which is not generated, so
	; they must be dropped between the two additions.  Previously the
	; carry from _AA+_bb leaked into the _cc addition and made RESULT1 one
	; too large whenever _AA+_bb overflowed
	; (e.g. $00FF * $01FF = $0001FD01 gave RESULT1=$FE instead of $FD).
	clc
_AA:
	lda	#0			; _AA
_bb:
	adc	#0			; + _bb
dd_no_carry1:				; label kept from the full-width routine
	clc				; discard carry into the unused RESULT2
_cc:
	adc	#0			; + _cc
	sta	RESULT1

	ldx	TEMP			; restore caller's X
	rts