;=============================================================================
; multiply_u16x16 -- fast unsigned multiply, cut down for sierpinski
;=============================================================================
; MODIFIED from the full "fast 16x16 bit unsigned multiplication, 32-bit
; result" routine (originally named fixed_16x16_mul_unsigned).
;
; The full routine multiplies NUM1H:NUM1L by NUM2H:NUM2L and produces
; RESULT3:RESULT2:RESULT1:RESULT0.  For 8.8 x 8.8 fixed point with an 8.8
; result only the middle two bytes of the 32-bit product matter, and for
; sierpinski only RESULT1 is needed, so everything that feeds only
; RESULT2/RESULT3 has been removed to save cycles.
;
; What this version writes:
;       RESULT0 = low byte of (NUM1L * NUM2L)
;       RESULT1 = byte 1 of the 32-bit product, i.e. (AA + bb + cc) mod 256
;       RESULT2, RESULT3 are NOT written.
;
; Partial products (upper-case = high byte, lower-case = low byte):
;
;       NUM1H (x_i), NUM1L (x_f)
;       NUM2H (y_i), NUM2L (y_f)
;
;       NUM1L * NUM2L = AAaa
;       NUM1L * NUM2H = BBbb    (only bb is computed here)
;       NUM1H * NUM2L = CCcc    (only cc is computed here)
;       NUM1H * NUM2H = DDdd    (not computed here)
;
;                       AAaa
;                     BBbb
;                     CCcc
;                 + DDdd
;                 ----------
;                   RESULT
;
; Timing (original author's notes for the full 32-bit routine): the old
; routine took around 700 cycles for a 16bit x 16bit = 32bit multiply; the
; table-driven routine, at an expense of 2kB of lookup tables, takes around
; 250, dropping closer to 200 when a term is re-used.  This cut-down version
; does less work than that.
;
; Entry points:
;       multiply_u16x16           patch NUM1H:NUM1L into the code, then
;                                 multiply.
;       multiply_u16x16_same_num1 skip the patching and re-use the NUM1 value
;                                 patched in by an earlier call to
;                                 multiply_u16x16.
;       (The full routine's description selected between these with the
;       carry flag; this version does not test carry on entry.)
;
; Inputs:    NUM1H:NUM1L (first entry point only), NUM2H:NUM2L
; Outputs:   RESULT0, RESULT1
; Clobbers:  A, flags, TEMP.  X is saved in TEMP on entry and restored on
;            exit.
;
; Notes:
;  * The routine is self-modifying, so it has to execute from writable
;    memory.
;  * T1, T2 and RESULT are preferably on the zero page (original author's
;    allocation note).
;  * The patching overwrites the LOW byte of each table address operand, so
;    the lookup only lands on table[NUM1 + x] if each table starts on a page
;    boundary.
;  * square1_lo/hi and square2_lo/hi are defined elsewhere; presumably they
;    are the quarter-square tables used by this multiplication technique --
;    verify against the table generator.
;=============================================================================

multiply_u16x16:

        ;====================================================================
        ; Set up self-modifying code: hard-code NUM1H:NUM1L into the table
        ; lookups below.
        ;====================================================================

        lda     NUM1L                   ; low byte of NUM1
        sta     sm1a+1                  ; square1_lo lookup for NUM1L*NUM2L
        sta     sm3a+1                  ; square1_hi lookup for NUM1L*NUM2L
        sta     sm5a+1                  ; square1_lo lookup for NUM1L*NUM2H
        eor     #$ff                    ; invert the bits for subtracting
        sta     sm2a+1                  ; square2_lo lookup for NUM1L*NUM2L
        sta     sm4a+1                  ; square2_hi lookup for NUM1L*NUM2L
        sta     sm6a+1                  ; square2_lo lookup for NUM1L*NUM2H

        lda     NUM1H                   ; high byte of NUM1
        sta     sm1b+1                  ; square1_lo lookup for NUM1H*NUM2L
        eor     #$ff                    ; invert the bits for subtracting
        sta     sm2b+1                  ; square2_lo lookup for NUM1H*NUM2L

multiply_u16x16_same_num1:

        stx     TEMP                    ; preserve caller's X

        ;==========================
        ; NUM1L * NUM2L = AAaa
        ;==========================

        ldx     NUM2L
        sec
sm1a:
        lda     square1_lo,x            ; operand low byte patched to NUM1L
sm2a:
        sbc     square2_lo,x            ; operand low byte patched to ~NUM1L
        sta     RESULT0                 ; A is aa

sm3a:
        lda     square1_hi,x            ; borrow from the low byte carries in
sm4a:
        sbc     square2_hi,x
        sta     _AA+1                   ; A is AA, patched into the sum below

        ;==========================
        ; NUM1H * NUM2L = CCcc  (low byte cc only; X is still NUM2L)
        ;==========================

        sec
sm1b:
        lda     square1_lo,x            ; operand low byte patched to NUM1H
sm2b:
        sbc     square2_lo,x            ; operand low byte patched to ~NUM1H
        sta     _cc+1                   ; A is cc, patched into the sum below

        ;==========================
        ; NUM1L * NUM2H = BBbb  (low byte bb only)
        ;==========================

        ldx     NUM2H
        sec
sm5a:
        lda     square1_lo,x            ; operand low byte patched to NUM1L
sm6a:
        sbc     square2_lo,x            ; operand low byte patched to ~NUM1L
        sta     _bb+1                   ; A is bb, patched into the sum below

        ;===========================================
        ; RESULT1 = (AA + bb + cc) mod 256
        ;
        ; The carries out of these additions belong to RESULT2, which is
        ; not computed, so each one has to be thrown away.  In particular
        ; carry must be cleared again before cc is added: otherwise an
        ; overflow of AA+bb is added into the same byte and RESULT1 comes
        ; out one too high.
        ;===========================================

        clc
_AA:
        lda     #0                      ; immediate is patched with AA
_bb:
        adc     #0                      ; immediate is patched with bb
        clc                             ; drop the carry out of AA+bb
_cc:
        adc     #0                      ; immediate is patched with cc
        sta     RESULT1

        ldx     TEMP                    ; restore caller's X
        rts