dos33fsprogs/demos/hellmood_memories/multiply_u16x16.s

218 lines
4.1 KiB
ArmAsm

; Fast multiply
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; (RESULT2:RESULT1) of the 32 bit result. Skipping generation of the
; high and low bytes would save some cycles, but note that the code
; below still computes and stores RESULT0 and RESULT3.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply
; This routine, at an expense of 2kB of lookup tables, takes around 250
; If you reuse a term the next time this drops closer to 200
; Fast 16x16 bit unsigned multiplication, 32-bit result
; Input: NUM1H:NUM1L * NUM2H:NUM2L
; Result: RESULT3:RESULT2:RESULT1:RESULT0
;
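; Does self-modifying code to hard-code NUM1H:NUM1L into the code
; call multiply_u16x16 to (re)load NUM1H:NUM1L (58 cycles slower)
; call multiply_u16x16_same_num1 to re-use the previous NUM1H:NUM1L
; (the carry flag on entry is NOT used to select between the two)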
;
; clobbered: RESULT, TEMP, A, C and other flags
; (X is saved in TEMP on entry and restored before rts)
; Allocation setup: NUM1, NUM2 and RESULT preferably on Zero-page.
;
; NUM1H (x_i), NUM1L (x_f)
; NUM2H (y_i), NUM2L (y_f)
; NUM1L * NUM2L = AAaa
; NUM1L * NUM2H = BBbb
; NUM1H * NUM2L = CCcc
; NUM1H * NUM2H = DDdd
;
;       AAaa
;     BBbb
;     CCcc
; + DDdd
; ----------
;   RESULT
;fixed_16x16_mul_unsigned:
;-----------------------------------------------------------------------
; multiply_u16x16: entry point that (re)loads the NUM1 operand.
;
; In:  NUM1L, NUM1H = first operand (read here)
; Out: the byte at label+1 of every smNa / smNb instruction is rewritten
; Uses A only; X is not touched by this part.
;
; Each "sta smXY+1" overwrites the first operand byte (the low address
; byte on the 6502) of the lda/sbc abs,x instruction at that label, so
; the table lookups further down are effectively indexed by NUM1 as
; well as by X.  The square1 lookups receive the value itself, the
; square2 lookups receive its one's complement (eor #$ff).
; NOTE(review): patching only the low address byte presumably relies on
; the square tables being page-aligned -- confirm where they are defined.
;
; There is no rts here: execution falls through into
; multiply_u16x16_same_num1.
;-----------------------------------------------------------------------
multiply_u16x16:
;============================
; Set up self-modifying code
; this changes the code to be hard-coded to multiply by NUM1H:NUM1L
;============================
lda NUM1L ; load the low byte ; 3
sta sm1a+1 ; NUM1L -> square1_lo lookup of NUM1L*NUM2L ; 3
sta sm3a+1 ; NUM1L -> square1_hi lookup of NUM1L*NUM2L ; 3
sta sm5a+1 ; NUM1L -> square1_lo lookup of NUM1L*NUM2H ; 3
sta sm7a+1 ; NUM1L -> square1_hi lookup of NUM1L*NUM2H ; 3
eor #$ff ; invert the bits for subtracting ; 2
sta sm2a+1 ; ~NUM1L -> square2_lo lookup of NUM1L*NUM2L ; 3
sta sm4a+1 ; ~NUM1L -> square2_hi lookup of NUM1L*NUM2L ; 3
sta sm6a+1 ; ~NUM1L -> square2_lo lookup of NUM1L*NUM2H ; 3
sta sm8a+1 ; ~NUM1L -> square2_hi lookup of NUM1L*NUM2H ; 3
lda NUM1H ; load the high byte ; 3
sta sm1b+1 ; NUM1H -> square1_lo lookup of NUM1H*NUM2L ; 3
sta sm3b+1 ; NUM1H -> square1_hi lookup of NUM1H*NUM2L ; 3
sta sm5b+1 ; NUM1H -> square1_lo lookup of NUM1H*NUM2H ; 3
sta sm7b+1 ; NUM1H -> square1_hi lookup of NUM1H*NUM2H (no count given)
eor #$ff ; invert the bits for subtracting ; 2
sta sm2b+1 ; ~NUM1H -> square2_lo lookup of NUM1H*NUM2L ; 3
sta sm4b+1 ; ~NUM1H -> square2_hi lookup of NUM1H*NUM2L ; 3
sta sm6b+1 ; ~NUM1H -> square2_lo lookup of NUM1H*NUM2H ; 3
sta sm8b+1 ; ~NUM1H -> square2_hi lookup of NUM1H*NUM2H (no count given)
;===========
; 52
; (52 is the sum of the per-line counts annotated above; the two stores
;  to sm7b+1 and sm8b+1 carry no annotation and are not included)
;-----------------------------------------------------------------------
; multiply_u16x16_same_num1: multiply using the NUM1 value that is
; already patched into the smNa / smNb instructions.
;
; In:  NUM2L, NUM2H = second operand
;      smNa/smNb operand bytes = NUM1L / NUM1H and their complements,
;      as written by the setup code at multiply_u16x16
; Out: RESULT3:RESULT2:RESULT1:RESULT0 = 32-bit product
; X is saved to TEMP on entry and restored from TEMP before rts.
; A and the flags are modified.  The immediate operands at _AA, _bb,
; _BB, _CC, _cc and _dd are rewritten on every call (self-modifying).
;
; Each partial product is formed as (square1 entry) - (square2 entry),
; low byte first, then high byte with the borrow carried through sbc.
; The square1_*/square2_* tables are defined outside this view;
; presumably they are quarter-square tables -- verify against the
; table definitions.
;-----------------------------------------------------------------------
multiply_u16x16_same_num1:
stx TEMP ; preserve caller's X (restored at dd_no_carry2)
;==========================
; Perform NUM1L * NUM2L = AAaa
;==========================
ldx NUM2L ; (low le) ; 3
sec ; no borrow going into the subtraction ; 2
sm1a:
lda square1_lo,x ; 4
sm2a:
sbc square2_lo,x ; 4
; a is _aa
sta RESULT0 ; lowest product byte is final as-is
sm3a:
lda square1_hi,x ; 4
sm4a:
sbc square2_hi,x ; borrow from the low byte carries in here ; 4
; a is _AA
sta _AA+1 ; store into the immediate of "lda #0" at _AA ; 3
;===========
; 24
; Perform NUM1H * NUM2L = CCcc
; (X still holds NUM2L from above)
sec ; reset carry: previous sbc may have left a borrow ; 2
sm1b:
lda square1_lo,x ; 4
sm2b:
sbc square2_lo,x ; 4
; a is _cc
sta _cc+1 ; store into the immediate of "lda #0" at _cc ; 3
sm3b:
lda square1_hi,x ; 4
sm4b:
sbc square2_hi,x ; 4
; a is _CC
sta _CC+1 ; store into the immediate of "adc #0" at _CC ; 3
;===========
; 24
;==========================
; Perform NUM1L * NUM2H = BBbb
;==========================
ldx NUM2H ; switch the table index to the high byte of NUM2 ; 3
sec ; 2
sm5a:
lda square1_lo,x ; 4
sm6a:
sbc square2_lo,x ; 4
; a is _bb
sta _bb+1 ; store into the immediate of "adc #0" at _bb ; 3
sm7a:
lda square1_hi,x ; 4
sm8a:
sbc square2_hi,x ; 4
; a is _BB
sta _BB+1 ; store into the immediate of "lda #0" at _BB ; 3
;===========
; 27
;==========================
; Perform NUM1H * NUM2H = DDdd
;==========================
; (X still holds NUM2H from above)
sec ; 2
sm5b:
lda square1_lo,x ; 4
sm6b:
sbc square2_lo,x ; 4
; a is _dd
sta _dd+1 ; store into the immediate of "lda #0" at _dd ; 3
sm7b:
lda square1_hi,x ;
sm8b:
sbc square2_hi,x ;
; a = _DD
sta RESULT3 ; top byte; carries from below are added with inc
;===========
; 13
;===========================================
; Add the separate multiplications together
;===========================================
; The operands of the lda #/adc # instructions below were filled in by
; the stores above, so the "#0" values are placeholders only.
clc ; 2
_AA:
lda #0 ; loading _AA ; 2
_bb:
adc #0 ; adding in _bb ; 2
sta RESULT1 ; partial: _AA+_bb (carry goes into the next add) ; 3
;==========
; 9
; product[2]=_BB+_CC+c
_BB:
lda #0 ; loading _BB ; 2
_CC:
adc #0 ; adding in _CC ; 2
sta RESULT2 ; partial: _BB+_CC+carry ; 3
;===========
; 7
; product[3]=_DD+c
bcc dd_no_carry1 ; no carry out of byte 2: skip the increment
inc RESULT3 ; propagate the carry into the top byte
clc ; carry must be clear for the second pass below ; 2
;=============
; 2
dd_no_carry1:
; carry is clear here on both paths (bcc taken, or clc executed)
; product[1]=_AA+_bb+_cc
_cc:
lda #0 ; load _cc ; 2
adc RESULT1 ; 3
sta RESULT1 ; final byte 1 ; 3
; product[2]=_BB+_CC+_dd+c
_dd:
lda #0 ; load _dd ; 2
adc RESULT2 ; 3
sta RESULT2 ; final byte 2 ; 3
;===========
; 16
; product[3]=_DD+c
bcc dd_no_carry2 ; no carry out of byte 2: top byte already final
inc RESULT3 ; propagate the second carry into the top byte
;=============
; 0
dd_no_carry2:
ldx TEMP ; restore caller's X saved on entry
rts ; 6