From d5f906218aa9015d9fcf4b34edb8aa54b065daba Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Sat, 16 Dec 2017 14:17:33 -0500 Subject: [PATCH] tfv: move fast multiply to common area --- .../multiply_fast.s | 0 .../multiply_slow.s | 0 mode7/Makefile | 11 +- mode7/mode7.s | 2 +- tfv/Makefile | 3 +- tfv/tfv_flying.s | 2 +- tfv/tfv_multiply.s | 349 ------------------ 7 files changed, 10 insertions(+), 357 deletions(-) rename mode7/fast_multiply.s => asm_routines/multiply_fast.s (100%) rename tfv/tfv_multiply_slow.s => asm_routines/multiply_slow.s (100%) delete mode 100644 tfv/tfv_multiply.s diff --git a/mode7/fast_multiply.s b/asm_routines/multiply_fast.s similarity index 100% rename from mode7/fast_multiply.s rename to asm_routines/multiply_fast.s diff --git a/tfv/tfv_multiply_slow.s b/asm_routines/multiply_slow.s similarity index 100% rename from tfv/tfv_multiply_slow.s rename to asm_routines/multiply_slow.s diff --git a/mode7/Makefile b/mode7/Makefile index 7a2a0930..c7fb501b 100644 --- a/mode7/Makefile +++ b/mode7/Makefile @@ -23,7 +23,8 @@ MODE7_ISLAND: mode7_island.o ld65 -o MODE7_ISLAND mode7_island.o -C ./apple2_1000.inc mode7_island.o: mode7.s island_lookup.s island_map.inc \ - fast_multiply.s zp.inc sprites.inc \ + zp.inc sprites.inc \ + ../asm_routines/multiply_fast.s \ ../asm_routines/hlin_clearscreen.s \ ../asm_routines/pageflip.s \ ../asm_routines/gr_setpage.s \ @@ -36,8 +37,8 @@ MODE7_CHECKERBOARD: mode7_checkerboard.o ld65 -o MODE7_CHECKERBOARD mode7_checkerboard.o -C ./apple2_1000.inc mode7_checkerboard.o: mode7.s checkerboard_lookup.s \ - fast_multiply.s zp.inc sprites.inc \ - fast_multiply.s zp.inc sprites.inc \ + zp.inc sprites.inc \ + ../asm_routines/multiply_fast.s \ ../asm_routines/hlin_clearscreen.s \ ../asm_routines/pageflip.s \ ../asm_routines/gr_setpage.s \ @@ -50,8 +51,8 @@ MODE7_RAINBOW: mode7_rainbow.o ld65 -o MODE7_RAINBOW mode7_rainbow.o -C ./apple2_1000.inc mode7_rainbow.o: mode7.s rainbow_lookup.s \ - fast_multiply.s zp.inc sprites.inc \ - fast_multiply.s zp.inc sprites.inc \ + zp.inc sprites.inc \ + ../asm_routines/multiply_fast.s \ ../asm_routines/hlin_clearscreen.s \ ../asm_routines/pageflip.s \ ../asm_routines/gr_setpage.s \ diff --git a/mode7/mode7.s b/mode7/mode7.s index bb3a4c1a..a12026cf 100644 --- a/mode7/mode7.s +++ b/mode7/mode7.s @@ -1113,7 +1113,7 @@ exit: .include "island_map.inc" .endif -.include "fast_multiply.s" +.include "../asm_routines/multiply_fast.s" ; 8.8 fixed point diff --git a/tfv/Makefile b/tfv/Makefile index 7a226c2f..be5578ea 100644 --- a/tfv/Makefile +++ b/tfv/Makefile @@ -55,9 +55,10 @@ TFV: tfv.o ld65 -o TFV tfv.o -C ./apple2_1000.inc tfv.o: tfv.s \ - tfv_flying.s tfv_info.s tfv_multiply.s tfv_opener.s tfv_title.s \ + tfv_flying.s tfv_info.s tfv_opener.s tfv_title.s \ tfv_textentry.s tfv_worldmap.s \ tfv_backgrounds.inc tfv_sprites.inc tfv_zp.inc \ + ../asm_routines/multiply_fast.s \ ../asm_routines/hlin_clearscreen.s \ ../asm_routines/pageflip.s \ ../asm_routines/gr_setpage.s \ diff --git a/tfv/tfv_flying.s b/tfv/tfv_flying.s index eabbf663..44c75aa0 100644 --- a/tfv/tfv_flying.s +++ b/tfv/tfv_flying.s @@ -1171,7 +1171,7 @@ water_map: .byte $22,$22,$22,$22, $22,$22,$22,$22 .byte $22,$22,$22,$22, $ee,$22,$22,$22 -.include "tfv_multiply.s" +.include "../asm_routines/multiply_fast.s" ; 8.8 fixed point diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s deleted file mode 100644 index 294b3840..00000000 --- a/tfv/tfv_multiply.s +++ /dev/null @@ -1,349 +0,0 @@ -; Fast mutiply - - -; Note for our purposes we only care about 8.8 x 8.8 fixed point -; with 8.8 result, which means we only care about the middle two bytes -; of the 32 bit result. So we disable generation of the high and low byte -; to save some cycles. - -; -; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply -; This routine, at an expense of 2kB of looku tables, takes around 250 -; If you reuse a term the next time this drops closer to 200 - -; This routine was described by Stephen Judd and found -; in The Fridge and in the C=Hacking magazine -; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication - -; The key thing to note is that -; (a+b)^2 (a-b)^2 -; a*b = ------- - -------- -; 4 4 -; So if you have tables of the squares of 0..511 you can lookup and subtract -; instead of multiplying. - -; Table generation: I:0..511 -; square1_lo = <((I*I)/4) -; square1_hi = >((I*I)/4) -; square2_lo = <(((I-255)*(I-255))/4) -; square2_hi = >(((I-255)*(I-255))/4) - -; Note: DOS3.3 starts at $9600 - -square1_lo EQU $8E00 -square1_hi EQU $9000 -square2_lo EQU $9200 -square2_hi EQU $9400 - -; for(i=0;i<512;i++) { -; square1_lo[i]=((i*i)/4)&0xff; -; square1_hi[i]=(((i*i)/4)>>8)&0xff; -; square2_lo[i]=( ((i-255)*(i-255))/4)&0xff; -; square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff; -; } - -init_multiply_tables: - - ; Build the add tables - - ldx #$00 - txa - .byte $c9 ; CMP #immediate - skip TYA and clear carry flag -lb1: tya - adc #$00 ; 0 -ml1: sta square1_hi,x ; square1_hi[0]=0 - tay ; y=0 - cmp #$40 ; subtract 64 and update flags (c=0) - txa ; a=0 - ror ; rotate -ml9: adc #$00 ; add 0 - sta ml9+1 ; update add value - inx ; x=1 -ml0: sta square1_lo,x ; square1_lo[0]=1 - bne lb1 ; if not zero, loop - inc ml0+2 ; increment values - inc ml1+2 ; increment values - clc ; c=0 - iny ; y=1 - bne lb1 ; loop - - ; Build the subtract tables based on the existing one - - ldx #$00 - ldy #$ff -second_table: - lda square1_hi+1,x - sta square2_hi+$100,x - lda square1_hi,x - sta square2_hi,y - lda square1_lo+1,x - sta square2_lo+$100,x - lda square1_lo,x - sta square2_lo,y - dey - inx - bne second_table - - - rts - - -; Fast 16x16 bit unsigned multiplication, 32-bit result -; Input: NUM1H:NUM1L * NUM2H:NUM2L -; Result: RESULT3:RESULT2:RESULT1:RESULT0 -; -; Does self-modifying code to hard-code NUM1H:NUM1L into the code -; carry=0: re-use previous NUM1H:NUM1L -; carry=1: reload NUM1H:NUM1L (58 cycles slower) -; -; clobbered: RESULT, X, A, C -; Allocation setup: T1,T2 and RESULT preferably on Zero-page. -; -; NUM1H (x_i), NUM1L (x_f) -; NUM2H (y_i), NUM2L (y_f) - -; NUM1L * NUM2L = AAaa -; NUM1L * NUM2H = BBbb -; NUM1H * NUM2L = CCcc -; NUM1H * NUM2H = DDdd -; -; AAaa -; BBbb -; CCcc -; + DDdd -; ---------- -; RESULT - -;fixed_16x16_mul_unsigned: - -multiply: - - bcc num1_same_as_last_time ; 2nt/3 - - ;============================ - ; Set up self-modifying code - ; this changes the code to be hard-coded to multiply by NUM1H:NUM1L - ;============================ - - lda NUM1L ; load the low byte ; 3 - sta sm1a+1 ; 3 - sta sm3a+1 ; 3 - sta sm5a+1 ; 3 - sta sm7a+1 ; 3 - eor #$ff ; invert the bits for subtracting ; 2 - sta sm2a+1 ; 3 - sta sm4a+1 ; 3 - sta sm6a+1 ; 3 - sta sm8a+1 ; 3 - lda NUM1H ; load the high byte ; 3 - sta sm1b+1 ; 3 - sta sm3b+1 ; 3 - sta sm5b+1 ; 3 -; sta sm7b+1 ; - eor #$ff ; invert the bits for subtractin ; 2 - sta sm2b+1 ; 3 - sta sm4b+1 ; 3 - sta sm6b+1 ; 3 -; sta sm8b+1 ; - ;=========== - ; 52 - -num1_same_as_last_time: - - ;========================== - ; Perform NUM1L * NUM2L = AAaa - ;========================== - - ldx NUM2L ; (low le) ; 3 - sec ; 2 -sm1a: - lda square1_lo,x ; 4 -sm2a: - sbc square2_lo,x ; 4 - - ; a is _aa - -; sta RESULT+0 ; - -sm3a: - lda square1_hi,x ; 4 -sm4a: - sbc square2_hi,x ; 4 - ; a is _AA - sta _AA+1 ; 3 - ;=========== - ; 24 - - ; Perform NUM1H * NUM2L = CCcc - sec ; 2 -sm1b: - lda square1_lo,x ; 4 -sm2b: - sbc square2_lo,x ; 4 - ; a is _cc - sta _cc+1 ; 3 -sm3b: - lda square1_hi,x ; 4 -sm4b: - sbc square2_hi,x ; 4 - ; a is _CC - sta _CC+1 ; 3 - ;=========== - ; 24 - - ;========================== - ; Perform NUM1L * NUM2H = BBbb - ;========================== - ldx NUM2H ; 3 - sec ; 2 -sm5a: - lda square1_lo,x ; 4 -sm6a: - sbc square2_lo,x ; 4 - ; a is _bb - sta _bb+1 ; 3 - -sm7a: - lda square1_hi,x ; 4 -sm8a: - sbc square2_hi,x ; 4 - ; a is _BB - sta _BB+1 ; 3 - ;=========== - ; 27 - - ;========================== - ; Perform NUM1H * NUM2H = DDdd - ;========================== - sec ; 2 -sm5b: - lda square1_lo,x ; 4 -sm6b: - sbc square2_lo,x ; 4 - ; a is _dd - sta _dd+1 ; 3 -;sm7b: -; lda square1_hi,x ; -;sm8b: -; sbc square2_hi,x ; - ; a = _DD -; sta RESULT+3 ; - ;=========== - ; 13 - - ;=========================================== - ; Add the separate multiplications together - ;=========================================== - - clc ; 2 -_AA: - lda #0 ; loading _AA ; 2 -_bb: - adc #0 ; adding in _bb ; 2 - sta RESULT+1 ; 3 - ;========== - ; 9 - ; product[2]=_BB+_CC+c - -_BB: - lda #0 ; loading _BB ; 2 -_CC: - adc #0 ; adding in _CC ; 2 - sta RESULT+2 ; 3 - ;=========== - ; 7 - - ; product[3]=_DD+c - -; bcc dd_no_carry1 ; -; inc RESULT+3 ; - clc ; 2 - ;============= - ; 2 -dd_no_carry1: - - ; product[1]=_AA+_bb+_cc - -_cc: - lda #0 ; load _cc ; 2 - adc RESULT+1 ; 3 - sta RESULT+1 ; 3 - - ; product[2]=_BB+_CC+_dd+c - -_dd: - lda #0 ; load _dd ; 2 - adc RESULT+2 ; 3 - sta RESULT+2 ; 3 - - ;=========== - ; 16 - ; product[3]=_DD+c - - -; bcc dd_no_carry2 ; -; inc RESULT+3 ; - - ;============= - ; 0 - -dd_no_carry2: - -; *z_i=product[1]; -; *z_f=product[0]; - -; rts ; 6 - - - ;================= - ; Signed multiply - ;================= - -;multiply: - -; jsr fixed_16x16_mul_unsigned ; 6 - - lda NUM1H ; x_i ; 3 - ;=========== - ; 12 - - - bpl x_positive ;^3/2nt - - sec ; 2 - lda RESULT+2 ; 3 - sbc NUM2L ; 3 - sta RESULT+2 ; 3 -; lda RESULT+3 ; -; sbc NUM2H ; -; sta RESULT+3 ; - ;============ - ; 10 - -x_positive: - - lda NUM2H ; y_i ; 3 - ;============ - ; ; 6 - - bpl y_positive ;^3/2nt - - - sec ; 2 - lda RESULT+2 ; 3 - sbc NUM1L ; 3 - sta RESULT+2 ; 3 -; lda RESULT+3 ; -; sbc NUM1H ; -; sta RESULT+3 ; - ;=========== - ; 10 - -y_positive: - ldx RESULT+2 ; *z_i=product[2]; ; 3 - lda RESULT+1 ; *z_f=product[1]; ; 3 - - rts ; 6 - ;========== - ; 12 -