From 71fde333c9eb4918f9add0b3633aaac97f0429bf Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Sat, 25 Nov 2017 21:55:45 -0500 Subject: [PATCH] tfv: optimize the multiply some more --- gr-sim/tfv_flying_6502.c | 63 ++++++------ tfv/OPTIMIZATION | 16 +++ tfv/TODO | 2 + tfv/tfv_flying.s | 212 +++++++++++++++++++-------------------- tfv/tfv_multiply.s | 8 +- 5 files changed, 157 insertions(+), 144 deletions(-) diff --git a/gr-sim/tfv_flying_6502.c b/gr-sim/tfv_flying_6502.c index ebf0b870..fef4fc8b 100644 --- a/gr-sim/tfv_flying_6502.c +++ b/gr-sim/tfv_flying_6502.c @@ -43,8 +43,6 @@ #define SPEED 0x7b #define SPLASH_COUNT 0x7c #define OVER_WATER 0x7d -#define TEMP2_I 0x7e -#define TEMP2_F 0x7f #define SHIPY 0xE4 @@ -57,6 +55,10 @@ #define CONST_SHIPX 15 +#define CONST_LOWRES_HALF_I 0xec // -20 (LOWRES_W/2) +#define CONST_LOWRES_HALF_F 0x0 + + /* Mode7 code based on code from: */ /* http://www.helixsoft.nl/articles/circle/sincos.htm */ @@ -562,7 +564,7 @@ y_positive: // return (product[3]<<24)|(product[2]<<16)|(product[1]<<8)|product[0]; // rts ; 6 - cycles.multiply+=6; + cycles.multiply+=12; } @@ -786,9 +788,9 @@ void draw_background_mode7(void) { hlin_double(ram[DRAW_PAGE], 0, 40, 6); cycles.mode7+=14+63+(16*40); - cycles.mode7+=28; + cycles.mode7+=30; /* FIXME: only do this if SPACEZ changes? 
*/ -// mul +// mul1 fixed_mul(ram[SPACEZ_I],ram[SPACEZ_F], CONST_BETA_I,CONST_BETA_F, &ram[FACTOR_I],&ram[FACTOR_F],0); @@ -804,7 +806,7 @@ void draw_background_mode7(void) { // fixed_to_double(ram[SPACEZ_I],ram[SPACEZ_F],), // fixed_to_double(&BETA), // fixed_to_double(ram[FACTOR_I],ram[FACTOR_F])); - cycles.mode7+=22; + cycles.mode7+=12; for (ram[SCREEN_Y] = 8; ram[SCREEN_Y] < LOWRES_H; ram[SCREEN_Y]+=2) { @@ -825,11 +827,11 @@ void draw_background_mode7(void) { ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F]); } -//mul // calculate the distance of the line we are drawing +//mul2 // calculate the distance of the line we are drawing fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F], CONST_SCALE_I,CONST_SCALE_F, &ram[DISTANCE_I],&ram[DISTANCE_F],0); - cycles.mode7+=44; + cycles.mode7+=34; if (!displayed) { printf("DISTANCE %x:%x\n",ram[DISTANCE_I],ram[DISTANCE_F]); } @@ -840,11 +842,11 @@ void draw_background_mode7(void) { ram[DX_F]=fixed_sin[(ram[ANGLE]+8)&0xf].f; // -sin() cycles.mode7+=29; -// mul +// mul3 fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F], ram[DX_I],ram[DX_F], &ram[DX_I],&ram[DX_F],1); - cycles.mode7+=48; + cycles.mode7+=26; if (!displayed) { printf("DX %x:%x\n",ram[DX_I],ram[DX_F]); } @@ -853,11 +855,11 @@ void draw_background_mode7(void) { ram[DY_I]=fixed_sin[(ram[ANGLE]+4)&0xf].i; // cos() ram[DY_F]=fixed_sin[(ram[ANGLE]+4)&0xf].f; // cos() cycles.mode7+=29; -// mul +// mul4 fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F], ram[DY_I],ram[DY_F], &ram[DY_I],&ram[DY_F],1); - cycles.mode7+=48; + cycles.mode7+=28; if (!displayed) { printf("DY %x:%x\n",ram[DY_I],ram[DY_F]); } @@ -874,11 +876,11 @@ void draw_background_mode7(void) { ram[TEMP_I]=fixed_sin[(ram[ANGLE]+4)&0xf].i; // cos ram[TEMP_F]=fixed_sin[(ram[ANGLE]+4)&0xf].f; // cos cycles.mode7+=29; -// mul +// mul5 fixed_mul(ram[SPACEX_I],ram[SPACEX_F], ram[TEMP_I],ram[TEMP_F], &ram[SPACEX_I],&ram[SPACEX_F],0); - cycles.mode7+=48; + cycles.mode7+=38; fixed_add(ram[SPACEX_I],ram[SPACEX_F], ram[CX_I],ram[CX_F], 
@@ -888,25 +890,22 @@ void draw_background_mode7(void) { ram[TEMP_I]=fixed_sin[ram[ANGLE]&0xf].i; ram[TEMP_F]=fixed_sin[ram[ANGLE]&0xf].f; cycles.mode7+=25; -// mul +// mul6 fixed_mul(ram[SPACEY_I],ram[SPACEY_F], ram[TEMP_I],ram[TEMP_F], &ram[SPACEY_I],&ram[SPACEY_F],0); - cycles.mode7+=48; + cycles.mode7+=38; + fixed_add(ram[SPACEY_I],ram[SPACEY_F], ram[CY_I],ram[CY_F], &ram[SPACEY_I],&ram[SPACEY_F]); - ram[TEMP2_I]=0xec; // -20 (LOWRES_W/2) - ram[TEMP2_F]=0; - cycles.mode7+=30; - -// mul - fixed_mul(ram[TEMP2_I],ram[TEMP2_F], +// mul7 + fixed_mul(CONST_LOWRES_HALF_I,CONST_LOWRES_HALF_F, ram[DX_I],ram[DX_F], &ram[TEMP_I],&ram[TEMP_F],0); - cycles.mode7+=48; + cycles.mode7+=38; fixed_add(ram[SPACEX_I],ram[SPACEX_F], ram[TEMP_I],ram[TEMP_F], @@ -917,15 +916,11 @@ void draw_background_mode7(void) { ram[SPACEX_I],ram[SPACEX_F]); } - -// ram[TEMP_I]=0xec; // -20 (LOWRES_W/2) -// ram[TEMP_F]=0; - cycles.mode7+=30; -// mul - fixed_mul(ram[TEMP2_I],ram[TEMP2_F], +// mul8 + fixed_mul(CONST_LOWRES_HALF_I,CONST_LOWRES_HALF_F, ram[DY_I],ram[DY_F], &ram[TEMP_I],&ram[TEMP_F],1); - cycles.mode7+=48; + cycles.mode7+=26; fixed_add(ram[SPACEY_I],ram[SPACEY_F], ram[TEMP_I],ram[TEMP_F], &ram[SPACEY_I],&ram[SPACEY_F]); @@ -983,16 +978,16 @@ int flying(void) { memset(&cycles,0,sizeof(cycles)); fixed_mul(0x1,0x0, 0x2,0x0, - &ram[FACTOR_I],&ram[FACTOR_F],0); + &ram[TEMP_I],&ram[TEMP_F],0); printf("Multiplying 1.0 * 2.0 = %d.%d, took %d cycles\n", - ram[FACTOR_I],ram[FACTOR_F],cycles.multiply); + ram[TEMP_I],ram[TEMP_F],cycles.multiply); memset(&cycles,0,sizeof(cycles)); fixed_mul(0xff,0xff, 0xff,0xff, - &ram[FACTOR_I],&ram[FACTOR_F],0); + &ram[TEMP_I],&ram[TEMP_F],0); printf("Multiplying ff.ff * ff.ff = %d.%d, took %d cycles\n", - ram[FACTOR_I],ram[FACTOR_F],cycles.multiply); + ram[TEMP_I],ram[TEMP_F],cycles.multiply); gr(); clear_bottom(PAGE0); diff --git a/tfv/OPTIMIZATION b/tfv/OPTIMIZATION index eeeea5f8..6b072d65 100644 --- a/tfv/OPTIMIZATION +++ b/tfv/OPTIMIZATION @@ -42,3 +42,19 
@@ Update to use "fast multiply" w 2kB squares table lookup: ================================= Total = 139,833 Frame Rate = 7.15 fps + +Update to optimize fast multiply (reusing NUM1H, returning results in registers) + Multiplying 1.0 * 2.0 = 2.0, took 234 cycles + Multiplying ff.ff * ff.ff = 0.0, took 278 cycles + + Cycles: flying= 162 + Cycles: getkey= 46 + Cycles: page_flip= 26 + Cycles: multiply= 24,935 + Cycles: mode7= 73,925 + Cycles: lookup_map= 33,920 + Cycles: put_sprite= 2,561 + ================================= + Total = 135,575 + Frame Rate = 7.38 fps + diff --git a/tfv/TODO b/tfv/TODO index ad96ffbb..fcc7b78e 100644 --- a/tfv/TODO +++ b/tfv/TODO @@ -4,6 +4,8 @@ mode7 speed fixes: + faster multiply routine + If result is AABBCCDD we only need BBCC for fixed point result + + Pass NUM1H in A? No, because it is used in multiple places + + inline hlin_setup? (save 12 cycles) + re-arrange variables to better take advantage of self-modifying code + only doing the spacez calculation if it has changed diff --git a/tfv/tfv_flying.s b/tfv/tfv_flying.s index b1dce84c..67650cb9 100644 --- a/tfv/tfv_flying.s +++ b/tfv/tfv_flying.s @@ -11,7 +11,8 @@ CONST_BETA_I EQU $ff CONST_BETA_F EQU $80 CONST_SCALE_I EQU $14 CONST_SCALE_F EQU $00 - +CONST_LOWRES_HALF_I EQU $ec ; -(LOWRES_W/2) +CONST_LOWRES_HALF_F EQU $00 flying_start: @@ -553,7 +554,7 @@ sky_loop: ; draw line across screen bne sky_loop ; 3/2nt ;============= ; (23+63+(X*16))*5 - ; Draw Horizon + ; Draw Hazy Horizon lda #COLOR_BOTH_GREY ; Horizon is Grey ; 2 sta COLOR ; 3 @@ -564,8 +565,9 @@ sky_loop: ; draw line across screen jsr hlin_double ; hlin 0,40 at 6 ; 63+(X*16) ;=========== ; 63+(X*16)+14 + ; FIXME: only do this if Z changes? 
; fixed_mul(&space_z,&BETA,&factor); - +;mul1 lda SPACEZ_I ; 3 sta NUM1H ; 3 lda SPACEZ_F ; 3 @@ -581,10 +583,8 @@ sky_loop: ; draw line across screen ;=========== ; 30 - lda RESULT+2 ; 4 - sta FACTOR_I ; 4 - lda RESULT+1 ; 4 - sta FACTOR_F ; 4 + sta FACTOR_I ; 3 + stx FACTOR_F ; 3 ;; SPACEZ=78 * ff80 = FACTOR=66 @@ -595,7 +595,7 @@ sky_loop: ; draw line across screen lda #8 ; 2 sta SCREEN_Y ; 4 ;============= - ; 22 + ; 12 screeny_loop: ldy #0 ; 2 jsr hlin_setup ; y-coord in a, x-coord in y ; 41 @@ -629,25 +629,23 @@ screeny_loop: ;============ ; 44 ;; brk ASM, horiz_scale = 00:73 - +; mul2 ; calculate the distance of the line we are drawing ; fixed_mul(&horizontal_scale,&scale,&distance); lda HORIZ_SCALE_I ; 3 - sta NUM1H ; 4 + sta NUM1H ; 3 lda HORIZ_SCALE_F ; 3 - sta NUM1L ; 4 + sta NUM1L ; 3 lda #CONST_SCALE_I ; SCALE_I ; 2 - sta NUM2H ; 4 + sta NUM2H ; 3 lda #CONST_SCALE_F ; SCALE_F ; 2 - sta NUM2L ; 4 + sta NUM2L ; 3 sec ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta DISTANCE_I ; 2 - lda RESULT+1 ; 4 - sta DISTANCE_F ; 2 + stx DISTANCE_F ; 2 ;========== - ; 46 + ; 34 ;; brk ASM, distance = 08:fc ; calculate the dx and dy of points in space when we step @@ -666,24 +664,22 @@ screeny_loop: sta DX_F ; 3 ;========== ; 29 - +;mul3 ; fixed_mul(&dx,&horizontal_scale,&dx); - lda HORIZ_SCALE_I ; 3 - sta NUM1H ; 4 - lda HORIZ_SCALE_F ; 3 - sta NUM1L ; 4 +; lda HORIZ_SCALE_I +; sta NUM1H +; lda HORIZ_SCALE_F +; sta NUM1L lda DX_I ; 3 - sta NUM2H ; 4 + sta NUM2H ; 3 lda DX_F ; 3 - sta NUM2L ; 4 - sec ; 2 + sta NUM2L ; 3 + clc ; reuse HORIZ_SCALE in NUM1 ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta DX_I ; 3 - lda RESULT+1 ; 4 - sta DX_F ; 3 + stx DX_F ; 3 ;========== - ; 48 + ; 26 ;; ANGLE ;; brk ASM, dx = 00:00 @@ -700,23 +696,22 @@ screeny_loop: sta DY_F ; 3 ;========== ; 29 +;mul4 ; fixed_mul(&dy,&horizontal_scale,&dy); - lda HORIZ_SCALE_I ; 3 - sta NUM1H ; 4 - lda HORIZ_SCALE_F ; 3 - sta NUM1L ; 4 +; lda HORIZ_SCALE_I +; sta NUM1H +; lda HORIZ_SCALE_F +; sta NUM1L lda 
DY_I ; 3 sta NUM2H ; 4 lda DY_F ; 3 sta NUM2L ; 4 - sec ; 2 + clc ; reuse horiz_scale in num1 ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta DY_I ; 3 - lda RESULT+1 ; 4 - sta DY_F ; 3 + stx DY_F ; 3 ;========== - ; 48 + ; 28 ;; brk ASM, dy = 00:73 ; calculate the starting position @@ -749,23 +744,22 @@ screeny_loop: ;========== ; 29 +; mul5 ; fixed_mul(&space_x,&temp,&space_x); lda SPACEX_I ; 3 - sta NUM1H ; 4 + sta NUM1H ; 3 lda SPACEX_F ; 3 - sta NUM1L ; 4 + sta NUM1L ; 3 lda TEMP_I ; 3 - sta NUM2H ; 4 + sta NUM2H ; 3 lda TEMP_F ; 3 - sta NUM2L ; 4 + sta NUM2L ; 3 sec ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta SPACEX_I ; 3 - lda RESULT+1 ; 4 - sta SPACEX_F ; 3 + stx SPACEX_F ; 3 ;========== - ; 48 + ; 38 clc ; fixed_add(&space_x,&cx,&space_x); ; 2 lda SPACEX_F ; 3 @@ -775,41 +769,7 @@ screeny_loop: adc CX_I ; 3 sta SPACEX_I ; 3 - lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2 - sta TEMP_I ; 3 - lda #0 ; temp.f=0; ; 2 - sta TEMP_F ; 3 - ;========== - ; 30 - ; fixed_mul(&temp,&dx,&temp); - lda TEMP_I ; 3 - sta NUM1H ; 4 - lda TEMP_F ; 3 - sta NUM1L ; 4 - lda DX_I ; 3 - sta NUM2H ; 4 - lda DX_F ; 3 - sta NUM2L ; 4 - sec ; 2 - jsr multiply ; 6 - lda RESULT+2 ; 4 - sta TEMP_I ; 3 - lda RESULT+1 ; 4 - sta TEMP_F ; 3 - ;========== - ; 48 - - - clc ; fixed_add(&space_x,&temp,&space_x); ; 2 - lda SPACEX_F ; 3 - adc TEMP_F ; 3 - sta SPACEX_F ; 3 - lda SPACEX_I ; 3 - adc TEMP_I ; 3 - sta SPACEX_I ; 3 - ;========== - ; 20 ; brk ; space_x = 06:bc lda ANGLE ; temp.i=fixed_sin[angle&0xf].i; ; 3 @@ -824,23 +784,22 @@ screeny_loop: ;========== ; 25 +;mul6 ; fixed_mul(&space_y,&fixed_temp,&space_y); lda SPACEY_I ; 3 - sta NUM1H ; 4 + sta NUM1H ; 3 lda SPACEY_F ; 3 - sta NUM1L ; 4 + sta NUM1L ; 3 lda TEMP_I ; 3 - sta NUM2H ; 4 + sta NUM2H ; 3 lda TEMP_F ; 3 - sta NUM2L ; 4 + sta NUM2L ; 3 sec ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta SPACEY_I ; 3 - lda RESULT+1 ; 4 - sta SPACEY_F ; 3 + stx SPACEY_F ; 3 ;========== - ; 48 + ; 38 clc ; fixed_add(&space_y,&cy,&space_y); ; 2 lda 
SPACEY_F ; 3 @@ -850,29 +809,68 @@ screeny_loop: adc CY_I ; 3 sta SPACEY_I ; 3 - lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2 - sta TEMP_I ; 3 - lda #0 ; temp.f=0; ; 2 - sta TEMP_F ; 3 - ;========== - ; 30 - ; fixed_mul(&fixed_temp,&dy,&fixed_temp); - lda TEMP_I ; 3 - sta NUM1H ; 4 - lda TEMP_F ; 3 - sta NUM1L ; 4 - lda DY_I ; 3 - sta NUM2H ; 4 - lda DY_F ; 3 - sta NUM2L ; 4 + +; lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2 +; sta TEMP_I ; 3 +; lda #0 ; temp.f=0; ; 2 +; sta TEMP_F ; 3 +; ;========== +; + ; 0 +; mul7 + ; fixed_mul(&temp,&dx,&temp); + lda #CONST_LOWRES_HALF_I ; 3 + sta NUM1H ; 3 + lda #CONST_LOWRES_HALF_F ; 3 + sta NUM1L ; 3 + lda DX_I ; 3 + sta NUM2H ; 3 + lda DX_F ; 3 + sta NUM2L ; 3 sec ; 2 jsr multiply ; 6 - lda RESULT+2 ; 4 sta TEMP_I ; 3 - lda RESULT+1 ; 4 - sta TEMP_F ; 3 + stx TEMP_F ; 3 ;========== - ; 48 + ; 38 + + + + clc ; fixed_add(&space_x,&temp,&space_x); ; 2 + lda SPACEX_F ; 3 + adc TEMP_F ; 3 + sta SPACEX_F ; 3 + lda SPACEX_I ; 3 + adc TEMP_I ; 3 + sta SPACEX_I ; 3 + ;========== + ; 20 + + + + +; lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2 +; sta TEMP_I ; 3 +; lda #0 ; temp.f=0; ; 2 +; sta TEMP_F ; 3 + ;========== + ; 0 +;mul8 + ; fixed_mul(&fixed_temp,&dy,&fixed_temp); +; lda #CONST_LOWRES_HALF_I +; sta NUM1H +; lda #CONST_LOWRES_HALF_F +; sta NUM1L + lda DY_I ; 3 + sta NUM2H ; 3 + lda DY_F ; 3 + sta NUM2L ; 3 + clc ; reuse LOWRES_HALF_I from last time ; 2 + jsr multiply ; 6 + sta TEMP_I ; 3 + stx TEMP_F ; 3 + ;========== + ; 26 clc ; fixed_add(&space_y,&fixed_temp,&space_y); ; 2 lda SPACEY_F ; 3 diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s index 440a3f0c..b8f57fea 100644 --- a/tfv/tfv_multiply.s +++ b/tfv/tfv_multiply.s @@ -329,8 +329,10 @@ x_positive: ; 19 y_positive: -; *z_i=product[2]; -; *z_f=product[1]; + lda RESULT+2 ; *z_i=product[2]; ; 3 + ldx RESULT+1 ; *z_f=product[1]; ; 3 - rts ; 6 + + + rts ; 12