tfv: optimize the multiply some more

This commit is contained in:
Vince Weaver 2017-11-25 21:55:45 -05:00
parent 6e0f505893
commit 71fde333c9
5 changed files with 157 additions and 144 deletions

View File

@ -43,8 +43,6 @@
#define SPEED 0x7b
#define SPLASH_COUNT 0x7c
#define OVER_WATER 0x7d
#define TEMP2_I 0x7e
#define TEMP2_F 0x7f
#define SHIPY 0xE4
@ -57,6 +55,10 @@
#define CONST_SHIPX 15
#define CONST_LOWRES_HALF_I 0xec // -20 (LOWRES_W/2)
#define CONST_LOWRES_HALF_F 0x0
/* Mode7 code based on code from: */
/* http://www.helixsoft.nl/articles/circle/sincos.htm */
@ -562,7 +564,7 @@ y_positive:
// return (product[3]<<24)|(product[2]<<16)|(product[1]<<8)|product[0];
// rts ; 6
cycles.multiply+=6;
cycles.multiply+=12;
}
@ -786,9 +788,9 @@ void draw_background_mode7(void) {
hlin_double(ram[DRAW_PAGE], 0, 40, 6);
cycles.mode7+=14+63+(16*40);
cycles.mode7+=28;
cycles.mode7+=30;
/* FIXME: only do this if SPACEZ changes? */
// mul
// mul1
fixed_mul(ram[SPACEZ_I],ram[SPACEZ_F],
CONST_BETA_I,CONST_BETA_F,
&ram[FACTOR_I],&ram[FACTOR_F],0);
@ -804,7 +806,7 @@ void draw_background_mode7(void) {
// fixed_to_double(ram[SPACEZ_I],ram[SPACEZ_F],),
// fixed_to_double(&BETA),
// fixed_to_double(ram[FACTOR_I],ram[FACTOR_F]));
cycles.mode7+=22;
cycles.mode7+=12;
for (ram[SCREEN_Y] = 8; ram[SCREEN_Y] < LOWRES_H; ram[SCREEN_Y]+=2) {
@ -825,11 +827,11 @@ void draw_background_mode7(void) {
ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F]);
}
//mul // calculate the distance of the line we are drawing
//mul2 // calculate the distance of the line we are drawing
fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F],
CONST_SCALE_I,CONST_SCALE_F,
&ram[DISTANCE_I],&ram[DISTANCE_F],0);
cycles.mode7+=44;
cycles.mode7+=34;
if (!displayed) {
printf("DISTANCE %x:%x\n",ram[DISTANCE_I],ram[DISTANCE_F]);
}
@ -840,11 +842,11 @@ void draw_background_mode7(void) {
ram[DX_F]=fixed_sin[(ram[ANGLE]+8)&0xf].f; // -sin()
cycles.mode7+=29;
// mul
// mul3
fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F],
ram[DX_I],ram[DX_F],
&ram[DX_I],&ram[DX_F],1);
cycles.mode7+=48;
cycles.mode7+=26;
if (!displayed) {
printf("DX %x:%x\n",ram[DX_I],ram[DX_F]);
}
@ -853,11 +855,11 @@ void draw_background_mode7(void) {
ram[DY_I]=fixed_sin[(ram[ANGLE]+4)&0xf].i; // cos()
ram[DY_F]=fixed_sin[(ram[ANGLE]+4)&0xf].f; // cos()
cycles.mode7+=29;
// mul
// mul4
fixed_mul(ram[HORIZ_SCALE_I],ram[HORIZ_SCALE_F],
ram[DY_I],ram[DY_F],
&ram[DY_I],&ram[DY_F],1);
cycles.mode7+=48;
cycles.mode7+=28;
if (!displayed) {
printf("DY %x:%x\n",ram[DY_I],ram[DY_F]);
}
@ -874,11 +876,11 @@ void draw_background_mode7(void) {
ram[TEMP_I]=fixed_sin[(ram[ANGLE]+4)&0xf].i; // cos
ram[TEMP_F]=fixed_sin[(ram[ANGLE]+4)&0xf].f; // cos
cycles.mode7+=29;
// mul
// mul5
fixed_mul(ram[SPACEX_I],ram[SPACEX_F],
ram[TEMP_I],ram[TEMP_F],
&ram[SPACEX_I],&ram[SPACEX_F],0);
cycles.mode7+=48;
cycles.mode7+=38;
fixed_add(ram[SPACEX_I],ram[SPACEX_F],
ram[CX_I],ram[CX_F],
@ -888,25 +890,22 @@ void draw_background_mode7(void) {
ram[TEMP_I]=fixed_sin[ram[ANGLE]&0xf].i;
ram[TEMP_F]=fixed_sin[ram[ANGLE]&0xf].f;
cycles.mode7+=25;
// mul
// mul6
fixed_mul(ram[SPACEY_I],ram[SPACEY_F],
ram[TEMP_I],ram[TEMP_F],
&ram[SPACEY_I],&ram[SPACEY_F],0);
cycles.mode7+=48;
cycles.mode7+=38;
fixed_add(ram[SPACEY_I],ram[SPACEY_F],
ram[CY_I],ram[CY_F],
&ram[SPACEY_I],&ram[SPACEY_F]);
ram[TEMP2_I]=0xec; // -20 (LOWRES_W/2)
ram[TEMP2_F]=0;
cycles.mode7+=30;
// mul
fixed_mul(ram[TEMP2_I],ram[TEMP2_F],
// mul7
fixed_mul(CONST_LOWRES_HALF_I,CONST_LOWRES_HALF_F,
ram[DX_I],ram[DX_F],
&ram[TEMP_I],&ram[TEMP_F],0);
cycles.mode7+=48;
cycles.mode7+=38;
fixed_add(ram[SPACEX_I],ram[SPACEX_F],
ram[TEMP_I],ram[TEMP_F],
@ -917,15 +916,11 @@ void draw_background_mode7(void) {
ram[SPACEX_I],ram[SPACEX_F]);
}
// ram[TEMP_I]=0xec; // -20 (LOWRES_W/2)
// ram[TEMP_F]=0;
cycles.mode7+=30;
// mul
fixed_mul(ram[TEMP2_I],ram[TEMP2_F],
// mul8
fixed_mul(CONST_LOWRES_HALF_I,CONST_LOWRES_HALF_F,
ram[DY_I],ram[DY_F],
&ram[TEMP_I],&ram[TEMP_F],1);
cycles.mode7+=48;
cycles.mode7+=26;
fixed_add(ram[SPACEY_I],ram[SPACEY_F],
ram[TEMP_I],ram[TEMP_F],
&ram[SPACEY_I],&ram[SPACEY_F]);
@ -983,16 +978,16 @@ int flying(void) {
memset(&cycles,0,sizeof(cycles));
fixed_mul(0x1,0x0,
0x2,0x0,
&ram[FACTOR_I],&ram[FACTOR_F],0);
&ram[TEMP_I],&ram[TEMP_F],0);
printf("Multiplying 1.0 * 2.0 = %d.%d, took %d cycles\n",
ram[FACTOR_I],ram[FACTOR_F],cycles.multiply);
ram[TEMP_I],ram[TEMP_F],cycles.multiply);
memset(&cycles,0,sizeof(cycles));
fixed_mul(0xff,0xff,
0xff,0xff,
&ram[FACTOR_I],&ram[FACTOR_F],0);
&ram[TEMP_I],&ram[TEMP_F],0);
printf("Multiplying ff.ff * ff.ff = %d.%d, took %d cycles\n",
ram[FACTOR_I],ram[FACTOR_F],cycles.multiply);
ram[TEMP_I],ram[TEMP_F],cycles.multiply);
gr();
clear_bottom(PAGE0);

View File

@ -42,3 +42,19 @@ Update to use "fast multiply" w 2kB squares table lookup:
=================================
Total = 139,833
Frame Rate = 7.15 fps
Update to optimize fast multiply (reusing NUM1H/NUM1L between calls, returning results in the A/X registers):
Multiplying 1.0 * 2.0 = 2.0, took 234 cycles
Multiplying ff.ff * ff.ff = 0.0, took 278 cycles
Cycles: flying= 162
Cycles: getkey= 46
Cycles: page_flip= 26
Cycles: multiply= 24,935
Cycles: mode7= 73,925
Cycles: lookup_map= 33,920
Cycles: put_sprite= 2,561
=================================
Total = 135,575
Frame Rate = 7.38 fps

View File

@ -4,6 +4,8 @@ mode7 speed fixes:
+ faster multiply routine
+ If result is AABBCCDD we only need BBCC for fixed point
result
+ Pass NUM1H in A? No, because it is used in multiple places
+ inline hlin_setup? (save 12 cycles)
+ re-arrange variables to better take advantage of self-modifying code
+ only doing the spacez calculation if it has changed

View File

@ -11,7 +11,8 @@ CONST_BETA_I EQU $ff
CONST_BETA_F EQU $80
CONST_SCALE_I EQU $14
CONST_SCALE_F EQU $00
CONST_LOWRES_HALF_I EQU $ec ; -(LOWRES_W/2)
CONST_LOWRES_HALF_F EQU $00
flying_start:
@ -553,7 +554,7 @@ sky_loop: ; draw line across screen
bne sky_loop ; 3/2nt
;=============
; (23+63+(X*16))*5
; Draw Horizon
; Draw Hazy Horizon
lda #COLOR_BOTH_GREY ; Horizon is Grey ; 2
sta COLOR ; 3
@ -564,8 +565,9 @@ sky_loop: ; draw line across screen
jsr hlin_double ; hlin 0,40 at 6 ; 63+(X*16)
;===========
; 63+(X*16)+14
; FIXME: only do this if Z changes?
; fixed_mul(&space_z,&BETA,&factor);
;mul1
lda SPACEZ_I ; 3
sta NUM1H ; 3
lda SPACEZ_F ; 3
@ -581,10 +583,8 @@ sky_loop: ; draw line across screen
;===========
; 30
lda RESULT+2 ; 4
sta FACTOR_I ; 4
lda RESULT+1 ; 4
sta FACTOR_F ; 4
sta FACTOR_I ; 3
stx FACTOR_F ; 3
;; SPACEZ=78 * ff80 = FACTOR=66
@ -595,7 +595,7 @@ sky_loop: ; draw line across screen
lda #8 ; 2
sta SCREEN_Y ; 4
;=============
; 22
; 12
screeny_loop:
ldy #0 ; 2
jsr hlin_setup ; y-coord in a, x-coord in y ; 41
@ -629,25 +629,23 @@ screeny_loop:
;============
; 44
;; brk ASM, horiz_scale = 00:73
; mul2
; calculate the distance of the line we are drawing
; fixed_mul(&horizontal_scale,&scale,&distance);
lda HORIZ_SCALE_I ; 3
sta NUM1H ; 4
sta NUM1H ; 3
lda HORIZ_SCALE_F ; 3
sta NUM1L ; 4
sta NUM1L ; 3
lda #CONST_SCALE_I ; SCALE_I ; 2
sta NUM2H ; 4
sta NUM2H ; 3
lda #CONST_SCALE_F ; SCALE_F ; 2
sta NUM2L ; 4
sta NUM2L ; 3
sec ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta DISTANCE_I ; 2
lda RESULT+1 ; 4
sta DISTANCE_F ; 2
stx DISTANCE_F ; 2
;==========
; 46
; 34
;; brk ASM, distance = 08:fc
; calculate the dx and dy of points in space when we step
@ -666,24 +664,22 @@ screeny_loop:
sta DX_F ; 3
;==========
; 29
;mul3
; fixed_mul(&dx,&horizontal_scale,&dx);
lda HORIZ_SCALE_I ; 3
sta NUM1H ; 4
lda HORIZ_SCALE_F ; 3
sta NUM1L ; 4
; lda HORIZ_SCALE_I
; sta NUM1H
; lda HORIZ_SCALE_F
; sta NUM1L
lda DX_I ; 3
sta NUM2H ; 4
sta NUM2H ; 3
lda DX_F ; 3
sta NUM2L ; 4
sec ; 2
sta NUM2L ; 3
clc ; reuse HORIZ_SCALE in NUM1 ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta DX_I ; 3
lda RESULT+1 ; 4
sta DX_F ; 3
stx DX_F ; 3
;==========
; 48
; 26
;; ANGLE
;; brk ASM, dx = 00:00
@ -700,23 +696,22 @@ screeny_loop:
sta DY_F ; 3
;==========
; 29
;mul4
; fixed_mul(&dy,&horizontal_scale,&dy);
lda HORIZ_SCALE_I ; 3
sta NUM1H ; 4
lda HORIZ_SCALE_F ; 3
sta NUM1L ; 4
; lda HORIZ_SCALE_I
; sta NUM1H
; lda HORIZ_SCALE_F
; sta NUM1L
lda DY_I ; 3
sta NUM2H ; 4
lda DY_F ; 3
sta NUM2L ; 4
sec ; 2
clc ; reuse horiz_scale in num1 ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta DY_I ; 3
lda RESULT+1 ; 4
sta DY_F ; 3
stx DY_F ; 3
;==========
; 48
; 28
;; brk ASM, dy = 00:73
; calculate the starting position
@ -749,23 +744,22 @@ screeny_loop:
;==========
; 29
; mul5
; fixed_mul(&space_x,&temp,&space_x);
lda SPACEX_I ; 3
sta NUM1H ; 4
sta NUM1H ; 3
lda SPACEX_F ; 3
sta NUM1L ; 4
sta NUM1L ; 3
lda TEMP_I ; 3
sta NUM2H ; 4
sta NUM2H ; 3
lda TEMP_F ; 3
sta NUM2L ; 4
sta NUM2L ; 3
sec ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta SPACEX_I ; 3
lda RESULT+1 ; 4
sta SPACEX_F ; 3
stx SPACEX_F ; 3
;==========
; 48
; 38
clc ; fixed_add(&space_x,&cx,&space_x); ; 2
lda SPACEX_F ; 3
@ -775,41 +769,7 @@ screeny_loop:
adc CX_I ; 3
sta SPACEX_I ; 3
lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2
sta TEMP_I ; 3
lda #0 ; temp.f=0; ; 2
sta TEMP_F ; 3
;==========
; 30
; fixed_mul(&temp,&dx,&temp);
lda TEMP_I ; 3
sta NUM1H ; 4
lda TEMP_F ; 3
sta NUM1L ; 4
lda DX_I ; 3
sta NUM2H ; 4
lda DX_F ; 3
sta NUM2L ; 4
sec ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta TEMP_I ; 3
lda RESULT+1 ; 4
sta TEMP_F ; 3
;==========
; 48
clc ; fixed_add(&space_x,&temp,&space_x); ; 2
lda SPACEX_F ; 3
adc TEMP_F ; 3
sta SPACEX_F ; 3
lda SPACEX_I ; 3
adc TEMP_I ; 3
sta SPACEX_I ; 3
;==========
; 20
; brk ; space_x = 06:bc
lda ANGLE ; temp.i=fixed_sin[angle&0xf].i; ; 3
@ -824,23 +784,22 @@ screeny_loop:
;==========
; 25
;mul6
; fixed_mul(&space_y,&fixed_temp,&space_y);
lda SPACEY_I ; 3
sta NUM1H ; 4
sta NUM1H ; 3
lda SPACEY_F ; 3
sta NUM1L ; 4
sta NUM1L ; 3
lda TEMP_I ; 3
sta NUM2H ; 4
sta NUM2H ; 3
lda TEMP_F ; 3
sta NUM2L ; 4
sta NUM2L ; 3
sec ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta SPACEY_I ; 3
lda RESULT+1 ; 4
sta SPACEY_F ; 3
stx SPACEY_F ; 3
;==========
; 48
; 38
clc ; fixed_add(&space_y,&cy,&space_y); ; 2
lda SPACEY_F ; 3
@ -850,29 +809,68 @@ screeny_loop:
adc CY_I ; 3
sta SPACEY_I ; 3
lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2
sta TEMP_I ; 3
lda #0 ; temp.f=0; ; 2
sta TEMP_F ; 3
;==========
; 30
; fixed_mul(&fixed_temp,&dy,&fixed_temp);
lda TEMP_I ; 3
sta NUM1H ; 4
lda TEMP_F ; 3
sta NUM1L ; 4
lda DY_I ; 3
sta NUM2H ; 4
lda DY_F ; 3
sta NUM2L ; 4
; lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2
; sta TEMP_I ; 3
; lda #0 ; temp.f=0; ; 2
; sta TEMP_F ; 3
; ;==========
;
; 0
; mul7
; fixed_mul(&temp,&dx,&temp);
lda #CONST_LOWRES_HALF_I ; 3
sta NUM1H ; 3
lda #CONST_LOWRES_HALF_F ; 3
sta NUM1L ; 3
lda DX_I ; 3
sta NUM2H ; 3
lda DX_F ; 3
sta NUM2L ; 3
sec ; 2
jsr multiply ; 6
lda RESULT+2 ; 4
sta TEMP_I ; 3
lda RESULT+1 ; 4
sta TEMP_F ; 3
stx TEMP_F ; 3
;==========
; 48
; 38
clc ; fixed_add(&space_x,&temp,&space_x); ; 2
lda SPACEX_F ; 3
adc TEMP_F ; 3
sta SPACEX_F ; 3
lda SPACEX_I ; 3
adc TEMP_I ; 3
sta SPACEX_I ; 3
;==========
; 20
; lda #$ec ; temp.i=0xec; // -20 (LOWRES_W/2) ; 2
; sta TEMP_I ; 3
; lda #0 ; temp.f=0; ; 2
; sta TEMP_F ; 3
;==========
; 30
;mul8
; fixed_mul(&fixed_temp,&dy,&fixed_temp);
; lda #CONST_LOWRES_HALF_I
; sta NUM1H
; lda #CONST_LOWRES_HALF_F
; sta NUM1L
lda DY_I ; 3
sta NUM2H ; 3
lda DY_F ; 3
sta NUM2L ; 3
clc ; reuse LOWRES_HALF_I from last time ; 2
jsr multiply ; 6
sta TEMP_I ; 3
stx TEMP_F ; 3
;==========
; 26
clc ; fixed_add(&space_y,&fixed_temp,&space_y); ; 2
lda SPACEY_F ; 3

View File

@ -329,8 +329,10 @@ x_positive:
; 19
y_positive:
; *z_i=product[2];
; *z_f=product[1];
lda RESULT+2 ; *z_i=product[2]; ; 3
ldx RESULT+1 ; *z_f=product[1]; ; 3
rts ; 6
rts ; 12