From 382fe07bfdf1a57c2785beb4b333e0c0b955509f Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Sat, 25 Nov 2017 19:37:02 -0500 Subject: [PATCH] tfv: plug in the high speed multiply --- tfv/TODO | 14 +- tfv/tfv_flying.s | 3 + tfv/tfv_multiply.s | 421 ++++++++++++++++++++++++++++++---------- tfv/tfv_multiply_slow.s | 129 ++++++++++++ 4 files changed, 460 insertions(+), 107 deletions(-) create mode 100644 tfv/tfv_multiply_slow.s diff --git a/tfv/TODO b/tfv/TODO index e544242e..ad96ffbb 100644 --- a/tfv/TODO +++ b/tfv/TODO @@ -1,11 +1,23 @@ mode7 speed fixes: + Don't draw sky every frame, only if needed + + faster multiply routine - + move multiply vars to zero page + + If result is AABBCCDD we only need BBCC for fixed point + result + + + re-arrange variables to better take advantage of self-modifying code + + only doing the spacez calculation if it has changed + + update the constants to be constants + + move the screen width constant to own set of variables + (instead of temp) + leave one of multiply results in accumulator at end? + Skip key parsing if no key read + slower: move to 40x40 again + draw every other line. 
First with color/black + then xor in the second line + short term: longer term: diff --git a/tfv/tfv_flying.s b/tfv/tfv_flying.s index 5ae1096a..c206bb2d 100644 --- a/tfv/tfv_flying.s +++ b/tfv/tfv_flying.s @@ -19,6 +19,9 @@ flying_start: jsr clear_screens jsr set_gr_page0 + + jsr init_multiply_tables + ;=============== ; Init Variables ;=============== diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s index c1801ab3..ae3a6fb6 100644 --- a/tfv/tfv_multiply.s +++ b/tfv/tfv_multiply.s @@ -1,129 +1,338 @@ -; http://www.llx.com/~nparker/a2/mult.html -; MULTIPLY NUM1H:NUM1L * NUM2H:NUM2L -; NUM2 is zeroed out -; result is in RESULT3:RESULT2:RESULT1:RESULT0 +; Fast multiply +; +; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply +; This routine, at an expense of 2kB of lookup tables, takes around 250 +; If you reuse a term the next time this drops closer to 200 -;NUM1L: .byte 0 -;NUM1H: .byte 0 -;NUM2L: .byte 0 -;NUM2H: .byte 0 -;RESULT: .byte 0,0,0,0 -;NEGATE: .byte 0 - -; If we have 2k to spare we should check out +; This routine was described by Stephen Judd and found +; in The Fridge and in the C=Hacking magazine ; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication -multiply: +; The key thing to note is that +; (a+b)^2 (a-b)^2 +; a*b = ------- - -------- +; 4 4 +; So if you have tables of the squares of 0..511 you can lookup and subtract +; instead of multiplying. 
- lda #$0 ; 2 - sta NEGATE ; 3 +; Table generation: I:0..511 +; square1_lo = <((I*I)/4) +; square1_hi = >((I*I)/4) +; square2_lo = <(((I-255)*(I-255))/4) +; square2_hi = >(((I-255)*(I-255))/4) - ; Handle Signed - lda NUM1H ; 3 - bpl check_num2 ; 2nt/3 - ;============== - ; 10 +; Note: DOS3.3 starts at $9600 - inc NEGATE ; 3 +square1_lo EQU $8E00 +square1_hi EQU $9000 +square2_lo EQU $9200 +square2_hi EQU $9400 - clc ; 2s-complement NUM1H/NUM1L ; 2 - lda NUM1L ; 3 - eor #$ff ; 2 - adc #$1 ; 2 - sta NUM1L ; 3 +; for(i=0;i<512;i++) { +; square1_lo[i]=((i*i)/4)&0xff; +; square1_hi[i]=(((i*i)/4)>>8)&0xff; +; square2_lo[i]=( ((i-255)*(i-255))/4)&0xff; +; square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff; +; } - lda NUM1H ; 3 - eor #$ff ; 2 - adc #$0 ; 2 - sta NUM1H ; 3 +init_multiply_tables: + + ; Build the add tables + + ldx #$00 + txa + .byte $c9 ; CMP #immediate - skip TYA and clear carry flag +lb1: tya + adc #$00 ; 0 +ml1: sta square1_hi,x ; square1_hi[0]=0 + tay ; y=0 + cmp #$40 ; subtract 64 and update flags (c=0) + txa ; a=0 + ror ; rotate +ml9: adc #$00 ; add 0 + sta ml9+1 ; update add value + inx ; x=1 +ml0: sta square1_lo,x ; square1_lo[0]=1 + bne lb1 ; if not zero, loop + inc ml0+2 ; increment values + inc ml1+2 ; increment values + clc ; c=0 + iny ; y=1 + bne lb1 ; loop + + ; Build the subtract tables based on the existing one + + ldx #$00 + ldy #$ff +second_table: + lda square1_hi+1,x + sta square2_hi+$100,x + lda square1_hi,x + sta square2_hi,y + lda square1_lo+1,x + sta square2_lo+$100,x + lda square1_lo,x + sta square2_lo,y + dey + inx + bne second_table + + + rts + + +; Fast 16x16 bit unsigned multiplication, 32-bit result +; Input: NUM1H:NUM1L * NUM2H:NUM2L +; Result: RESULT3:RESULT2:RESULT1:RESULT0 +; +; Does self-modifying code to hard-code NUM1H:NUM1L into the code +; carry=0: re-use previous NUM1H:NUM1L +; carry=1: reload NUM1H:NUM1L (58 cycles slower) +; +; clobbered: RESULT, X, A, C +; Allocation setup: T1,T2 and RESULT preferably on Zero-page. 
+; +; NUM1H (x_i), NUM1L (x_f) +; NUM2H (y_i), NUM2L (y_f) + +; NUM1L * NUM2L = AAaa +; NUM1L * NUM2H = BBbb +; NUM1H * NUM2L = CCcc +; NUM1H * NUM2H = DDdd +; +; AAaa +; BBbb +; CCcc +; + DDdd +; ---------- +; RESULT + +fixed_16x16_mul_unsigned: + + sec ; FIXME-remove when we implement this + + bcc num1_same_as_last_time ; 2nt/3 + + ;============================ + ; Set up self-modifying code + ; this changes the code to be hard-coded to multiply by NUM1H:NUM1L + ;============================ + + lda NUM1L ; load the low byte ; 3 + sta sm1a+1 ; 3 + sta sm3a+1 ; 3 + sta sm5a+1 ; 3 + sta sm7a+1 ; 3 + eor #$ff ; invert the bits for subtracting ; 2 + sta sm2a+1 ; 3 + sta sm4a+1 ; 3 + sta sm6a+1 ; 3 + sta sm8a+1 ; 3 + lda NUM1H ; load the high byte ; 3 + sta sm1b+1 ; 3 + sta sm3b+1 ; 3 + sta sm5b+1 ; 3 + sta sm7b+1 ; 3 + eor #$ff ; invert the bits for subtractin ; 2 + sta sm2b+1 ; 3 + sta sm4b+1 ; 3 + sta sm6b+1 ; 3 + sta sm8b+1 ; 3 ;=========== - ; 25 -check_num2: - lda NUM2H ; 3 - bpl unsigned_multiply ; 2nt/3 - ;============== - ; 6 + ; 58 - inc NEGATE ; 3 +num1_same_as_last_time: - clc ; 2 - lda NUM2L ; 3 - eor #$ff ; 2 - adc #$1 ; 2 - sta NUM2L ; 3 + ;========================== + ; Perform NUM1L * NUM2L = AAaa + ;========================== - lda NUM2H ; 3 - eor #$ff ; 2 - adc #$0 ; 2 - sta NUM2H ; 3 - ;============= - ; 25 -unsigned_multiply: + ldx NUM2L ; (low le) ; 3 + sec ; 2 +sm1a: + lda square1_lo,x ; 4 +sm2a: + sbc square2_lo,x ; 4 - lda #0 ; Initialize RESULT to 0 ; 2 - sta RESULT+2 ; 3 - ldx #16 ; 16x16 multiply ; 2 - ;============ - ; 7 -multiply_mainloop: - lsr NUM2H ; Shift right 16-bit NUM2 ; 5 - ror NUM2L ; low bit goes into carry ; 5 - bcc shift_output ; 0 or 1? 
; 2nt/3 - ;============ - ; 13 + ; a is _aa - tay ; If 1, add NUM1 (hi byte RESULT in A) ; 2 - clc ; 2 - lda NUM1L ; 3 - adc RESULT+2 ; 3 - sta RESULT+2 ; 3 - tya ; 2 - adc NUM1H ; 3 - ;============ - ; 18 -shift_output: - ror A ; "Stairstep" shift ; 2 - ror RESULT+2 ; 5 - ror RESULT+1 ; 5 - ror RESULT ; 5 - dex ; 2 - bne multiply_mainloop ; 2nt/3 - ;============= - ; 22 - - sta RESULT+3 ; 3 - - ;; Negate if necessary - - lda NEGATE ; 3 - and #$1 ; 2 - beq positive ; 2nt/3 - ;============== - ; 11 - - clc ; 2 - lda RESULT+0 ; 3 - eor #$ff ; 2 - adc #$1 ; 2 sta RESULT+0 ; 3 - lda RESULT+1 ; 3 - eor #$ff ; 2 - adc #$0 ; 2 - sta RESULT+1 ; 3 +sm3a: + lda square1_hi,x ; 4 +sm4a: + sbc square2_hi,x ; 4 + ; a is _AA + sta _AA+1 ; 3 + ;=========== + ; 27 - lda RESULT+2 ; 3 - eor #$ff ; 2 - adc #$0 ; 2 - sta RESULT+2 ; 3 + ; Perform NUM1H * NUM2L = CCcc + sec ; 2 +sm1b: + lda square1_lo,x ; 4 +sm2b: + sbc square2_lo,x ; 4 + ; a is _cc + sta _cc+1 ; 3 +sm3b: + lda square1_hi,x ; 4 +sm4b: + sbc square2_hi,x ; 4 + ; a is _CC + sta _CC+1 ; 3 + ;=========== + ; 24 - lda RESULT+3 ; 3 - eor #$ff ; 2 - adc #$0 ; 2 + ;========================== + ; Perform NUM1L * NUM2H = BBbb + ;========================== + ldx NUM2H ; 3 + sec ; 2 +sm5a: + lda square1_lo,x ; 4 +sm6a: + sbc square2_lo,x ; 4 + ; a is _bb + sta _bb+1 ; 3 + +sm7a: + lda square1_hi,x ; 4 +sm8a: + sbc square2_hi,x ; 4 + ; a is _BB + sta _BB+1 ; 3 + ;=========== + ; 27 + + ;========================== + ; Perform NUM1H * NUM2H = DDdd + ;========================== + sec ; 2 +sm5b: + lda square1_lo,x ; 4 +sm6b: + sbc square2_lo,x ; 4 + ; a is _dd + sta _dd+1 ; 3 +sm7b: + lda square1_hi,x ; 4 +sm8b: + sbc square2_hi,x ; 4 + ; a = _DD sta RESULT+3 ; 3 ;=========== - ; 42 -positive: + ; 24 + + ;=========================================== + ; Add the separate multiplications together + ;=========================================== + + clc ; 2 +_AA: + lda #0 ; loading _AA ; 2 +_bb: + adc #0 ; adding in _bb ; 2 + sta RESULT+1 ; 3 
+ + ; product[2]=_BB+_CC+c + +_BB: + lda #0 ; loading _BB ; 2 +_CC: + adc #0 ; adding in _CC ; 2 + sta RESULT+2 ; 3 + ;=========== + ; 19 + + ; product[3]=_DD+c + + bcc dd_no_carry1 ; ^2nt/3 + inc RESULT+3 ; 5 + clc ; 2 + ;============= + ; 6 +dd_no_carry1: + + ; product[1]=_AA+_bb+_cc + +_cc: + lda #0 ; load _cc ; 2 + adc RESULT+1 ; 3 + sta RESULT+1 ; 3 + + ; product[2]=_BB+_CC+_dd+c + +_dd: + lda #0 ; load _dd ; 2 + adc RESULT+2 ; 3 + sta RESULT+2 ; 3 + + ;=========== + ; 19 + ; product[3]=_DD+c + + + bcc dd_no_carry2 ; ^2nt/3 + inc RESULT+3 ; 5 + + ;============= + ; 4 + +dd_no_carry2: + +; *z_i=product[1]; +; *z_f=product[0]; + + rts ; 6 + + + ;================= + ; Signed multiply + ;================= + +multiply: + + jsr fixed_16x16_mul_unsigned ; 6 + + lda NUM1H ; x_i ; 3 + ;=========== + ; 12 + + + bpl x_positive ;^3/2nt + + sec ; 2 + lda RESULT+2 ; 3 + sbc NUM2L ; 3 + sta RESULT+2 ; 3 + lda RESULT+3 ; 3 + sbc NUM2H ; 3 + sta RESULT+3 ; 3 + ;============ + ; 19 + +x_positive: + + lda NUM2H ; y_i ; 3 + ;============ + ; ; 6 + bpl y_positive ; 3/2nt + + + sec ; 2 + lda RESULT+2 ; 3 + sbc NUM1L ; 3 + sta RESULT+2 ; 3 + lda RESULT+3 ; 3 + sbc NUM1H ; 3 + sta RESULT+3 ; 3 + ;=========== + ; 19 + +y_positive: +; *z_i=product[2]; +; *z_f=product[1]; rts ; 6 diff --git a/tfv/tfv_multiply_slow.s b/tfv/tfv_multiply_slow.s new file mode 100644 index 00000000..c1801ab3 --- /dev/null +++ b/tfv/tfv_multiply_slow.s @@ -0,0 +1,129 @@ +; http://www.llx.com/~nparker/a2/mult.html +; MULTIPLY NUM1H:NUM1L * NUM2H:NUM2L +; NUM2 is zeroed out +; result is in RESULT3:RESULT2:RESULT1:RESULT0 + +;NUM1L: .byte 0 +;NUM1H: .byte 0 +;NUM2L: .byte 0 +;NUM2H: .byte 0 +;RESULT: .byte 0,0,0,0 +;NEGATE: .byte 0 + +; If we have 2k to spare we should check out +; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication + +multiply: + + lda #$0 ; 2 + sta NEGATE ; 3 + + ; Handle Signed + lda NUM1H ; 3 + bpl check_num2 ; 2nt/3 + ;============== + ; 10 + + inc NEGATE ; 3 + + clc ; 
2s-complement NUM1H/NUM1L ; 2 + lda NUM1L ; 3 + eor #$ff ; 2 + adc #$1 ; 2 + sta NUM1L ; 3 + + lda NUM1H ; 3 + eor #$ff ; 2 + adc #$0 ; 2 + sta NUM1H ; 3 + ;=========== + ; 25 +check_num2: + lda NUM2H ; 3 + bpl unsigned_multiply ; 2nt/3 + ;============== + ; 6 + + inc NEGATE ; 3 + + clc ; 2 + lda NUM2L ; 3 + eor #$ff ; 2 + adc #$1 ; 2 + sta NUM2L ; 3 + + lda NUM2H ; 3 + eor #$ff ; 2 + adc #$0 ; 2 + sta NUM2H ; 3 + ;============= + ; 25 +unsigned_multiply: + + lda #0 ; Initialize RESULT to 0 ; 2 + sta RESULT+2 ; 3 + ldx #16 ; 16x16 multiply ; 2 + ;============ + ; 7 +multiply_mainloop: + lsr NUM2H ; Shift right 16-bit NUM2 ; 5 + ror NUM2L ; low bit goes into carry ; 5 + bcc shift_output ; 0 or 1? ; 2nt/3 + ;============ + ; 13 + + tay ; If 1, add NUM1 (hi byte RESULT in A) ; 2 + clc ; 2 + lda NUM1L ; 3 + adc RESULT+2 ; 3 + sta RESULT+2 ; 3 + tya ; 2 + adc NUM1H ; 3 + ;============ + ; 18 +shift_output: + ror A ; "Stairstep" shift ; 2 + ror RESULT+2 ; 5 + ror RESULT+1 ; 5 + ror RESULT ; 5 + dex ; 2 + bne multiply_mainloop ; 2nt/3 + ;============= + ; 22 + + sta RESULT+3 ; 3 + + ;; Negate if necessary + + lda NEGATE ; 3 + and #$1 ; 2 + beq positive ; 2nt/3 + ;============== + ; 11 + + clc ; 2 + lda RESULT+0 ; 3 + eor #$ff ; 2 + adc #$1 ; 2 + sta RESULT+0 ; 3 + + lda RESULT+1 ; 3 + eor #$ff ; 2 + adc #$0 ; 2 + sta RESULT+1 ; 3 + + lda RESULT+2 ; 3 + eor #$ff ; 2 + adc #$0 ; 2 + sta RESULT+2 ; 3 + + lda RESULT+3 ; 3 + eor #$ff ; 2 + adc #$0 ; 2 + sta RESULT+3 ; 3 + ;=========== + ; 42 +positive: + + rts ; 6 +