tfv: optimize the multiply routine down some more

This commit is contained in:
Vince Weaver 2017-11-29 00:38:06 -05:00
parent 68cac10f1e
commit a7f4ec0bc0
3 changed files with 101 additions and 84 deletions

View File

@ -291,8 +291,8 @@ static unsigned char square2_lo[512];
static unsigned char square2_hi[512];
static int sm1a,sm3a,sm5a,sm7a;
static int sm2a,sm4a,sm6a,sm8a;
static int sm1b,sm3b,sm5b,sm7b;
static int sm2b,sm4b,sm6b,sm8b;
static int sm1b,sm3b,sm5b; //,sm7b;
static int sm2b,sm4b,sm6b; //,sm8b;
static int table_ready=0;
@ -355,7 +355,8 @@ static int fixed_mul_unsigned(
int c=0;
int a,x;
int _AA,_BB,_CC,_DD,_aa,_bb,_cc,_dd;
int _AA,_BB,_CC; //,_DD;
int _aa,_bb,_cc,_dd;
if (!table_ready) init_table();
@ -377,57 +378,53 @@ static int fixed_mul_unsigned(
sm1b=a; // sta sm1b+1 ; 3
sm3b=a; // sta sm3b+1 ; 3
sm5b=a; // sta sm5b+1 ; 3
sm7b=a; // sta sm7b+1 ; 3
// sm7b=a; // sta sm7b+1 ;
a=(~a)&0xff; // eor #$ff ; 2
sm2b=a; // sta sm2b+1 ; 3
sm4b=a; // sta sm4b+1 ; 3
sm6b=a; // sta sm6b+1 ; 3
sm8b=a; // sta sm8b+1 ; 3
cycles.multiply+=58;
// sm8b=a; // sta sm8b+1 ;
cycles.multiply+=52;
}
/* Perform <T1 * <T2 = AAaa */
x=(y_f)&0xff; // ldx T2+0 (low le) ; 3
c=1; // sec ; 2
//sm1a:
//sm1a:
a=square1_lo[sm1a+x]; // lda square1_lo,x ; 4
//sm2a:
//sm2a:
a+=~(square2_lo[sm2a+x])+c; // sbc square2_lo,x ; 4
c=!(a&0x100);
a&=0xff;
// printf("\t\t\t\ta=(%d+%d)^2/4=%d "
// "b=(%d+%d)^2/4=%d\n",
// sm1a,x,square1_lo[sm1a+x],
// sm2a,x,square2_lo[sm2a+x]);
product[0]=a; // sta PRODUCT+0 ; 3
// product[0]=a; // sta PRODUCT+0 ;
_aa=a;
// printf("\t\t\t\ta-b aa=%2x\n",a);
//sm3a:
//sm3a:
a=square1_hi[sm3a+x]; // lda square1_hi,x ; 4
//sm4a:
//sm4a:
a+=(~(square2_hi[sm4a+x]))+c; // sbc square2_hi,x ; 4
c=!(a&0x100);
a&=0xff;
_AA=a; // sta _AA+1 ; 3
// ;===========
// ; 27
// ; 24
cycles.multiply+=27;
cycles.multiply+=24;
/* Perform >T1_hi * <T2 = CCcc */
c=1; // sec ; 2
//sm1b:
//sm1b:
a=square1_lo[sm1b+x]; // lda square1_lo,x ; 4
//sm2b:
//sm2b:
a+=(~(square2_lo[sm2b+x]))+c; // sbc square2_lo,x ; 4
c=!(a&0x100);
a&=0xff;
_cc=a; // sta _cc+1 ; 3
//sm3b:
//sm3b:
a=square1_hi[sm3b+x]; // lda square1_hi,x ; 4
//sm4b:
//sm4b:
a+=(~(square2_hi[sm4b+x]))+c; // sbc square2_hi,x ; 4
c=!!(a&0x100);
a&=0xff;
@ -438,17 +435,16 @@ static int fixed_mul_unsigned(
/* Perform <T1 * >T2 = BBbb */
x=(y_i)&0xff; // ldx T2+1 ; 3
c=1; // sec ; 2
//sm5a:
//sm5a:
a=square1_lo[sm5a+x]; // lda square1_lo,x ; 4
//sm6a:
//sm6a:
a+=(~(square2_lo[sm6a+x]))+c; // sbc square2_lo,x ; 4
c=!(a&0x100);
a&=0xff;
_bb=a; // sta _bb+1 ; 3
// printf("\t\t\t\tbb=%x c=%d\n",_bb,c);
//sm7a:
//sm7a:
a=square1_hi[sm7a+x]; // lda square1_hi,x ; 4
//sm8a:
//sm8a:
a+=(~(square2_hi[sm8a+x]))+c; // sbc square2_hi,x ; 4
c=!(a&0x100);
a&=0xff;
@ -457,23 +453,23 @@ static int fixed_mul_unsigned(
/* Perform >T1 * >T2 = DDdd */
c=1; // sec ; 2
//sm5b:
//sm5b:
a=square1_lo[sm5b+x]; // lda square1_lo,x ; 4
//sm6b:
//sm6b:
a+=(~(square2_lo[sm6b+x]))+c; // sbc square2_lo,x ; 4
c=!(a&0x100);
a&=0xff;
_dd=a; // sta _dd+1 ; 3
//sm7b:
a=square1_hi[sm7b+x]; // lda square1_hi,x ; 4
//sm8b:
a+=(~(square2_hi[sm8b+x]))+c; // sbc square2_hi,x ; 4
c=!(a&0x100);
a&=0xff;
//sm7b:
//a=square1_hi[sm7b+x]; // lda square1_hi,x ;
//sm8b:
//a+=(~(square2_hi[sm8b+x]))+c; // sbc square2_hi,x ;
//c=!(a&0x100);
//a&=0xff;
product[3]=a; // sta PRODUCT+3 ; 3
_DD=a;
cycles.multiply+=24;
//product[3]=a; // sta PRODUCT+3 ;
//_DD=a;
cycles.multiply+=13;
/*********************************************/
/* Add the separate multiplications together */
/*********************************************/
@ -485,9 +481,9 @@ static int fixed_mul_unsigned(
if (debug) printf("product[1]=%02x+%02x+0=",_AA,_bb);
c=0; // clc ; 2
//_AA:
//_AA:
a=_AA; // lda #0 ; 2
//_bb:
//_bb:
a+=(c+_bb); // adc #0 ; 2
c=!!(a&0x100);
a&=0xff;
@ -496,27 +492,27 @@ static int fixed_mul_unsigned(
cycles.multiply+=9;
// product[2]=_BB+_CC+c
if (debug) printf("product[2]=%02x+%02x+%d=",_BB,_CC,c);
//_BB:
//_BB:
a=_BB; // lda #0 ; 2
//_CC:
//_CC:
a+=(c+_CC); // adc #0 ; 2
c=!!(a&0x100);
a&=0xff;
product[2]=a; // sta PRODUCT+2 ; 3
if (debug) printf("%x.%02x\n",c,a);
cycles.multiply+=10;
cycles.multiply+=7;
// product[3]=_DD+c
if (debug) printf("product[3]=%02x+%d=",_DD,c);
if (c==0) goto urgh2; // bcc :+ ; 2nt/3
product[3]++; // inc PRODUCT+3 ; 5
product[3]&=0xff;
// if (debug) printf("product[3]=%02x+%d=",_DD,c);
// if (c==0) goto urgh2; // bcc :+ ; 2nt/3
// product[3]++; // inc PRODUCT+3 ; 5
// product[3]&=0xff;
c=0; // clc ; 2
cycles.multiply+=6;
urgh2:
cycles.multiply+=2;
//urgh2:
if (debug) printf("%x.%02x\n",c,product[3]);
// product[1]=_AA+_bb+_cc
if (debug) printf("product[1]=%02x+%02x+%d=",product[1],_cc,c);
//_cc:
//_cc:
a=_cc; // lda #0 ; 2
a+=c+product[1]; // adc PRODUCT+1 ; 3
c=!!(a&0x100);
@ -526,7 +522,7 @@ urgh2:
// product[2]=_BB+_CC+_dd+c
if (debug) printf("product[2]=%02x+%02x+%d=",product[2],_dd,c);
//_dd:
//_dd:
a=_dd; // lda #0 ; 2
a+=c+product[2]; // adc PRODUCT+2 ; 3
c=!!(a&0x100);
@ -535,25 +531,25 @@ urgh2:
if (debug) printf("%x.%02x\n",c,product[2]);
// product[3]=_DD+c
if (debug) printf("product[3]=%02x+%d=",product[3],c);
cycles.multiply+=19;
if (c==0) goto urgh; // bcc :+ ; 2nt/3
product[3]++; // inc PRODUCT+3 ; 5
product[3]&=0xff;
cycles.multiply+=4;
urgh:
//if (debug) printf("product[3]=%02x+%d=",product[3],c);
cycles.multiply+=16;
//if (c==0) goto urgh; // bcc :+ ; 2nt/3
//product[3]++; // inc PRODUCT+3 ; 5
//product[3]&=0xff;
cycles.multiply+=0;
//urgh:
if (debug) printf("%x.%02x\n",c,product[3]);
*z_i=product[1];
*z_f=product[0];
// printf("Result=%02x:%02x\n",*z_i,*z_f);
if (debug) {
printf(" AAaa %02x:%02x\n",_AA,_aa);
printf(" BBbb %02x:%02x\n",_BB,_bb);
printf(" CCcc %02x:%02x\n",_CC,_cc);
printf("DDdd %02x:%02x\n",_DD,_dd);
}
// if (debug) {
// printf(" AAaa %02x:%02x\n",_AA,_aa);
// printf(" BBbb %02x:%02x\n",_BB,_bb);
// printf(" CCcc %02x:%02x\n",_CC,_cc);
// printf("DDdd %02x:%02x\n",_DD,_dd);
// }
cycles.multiply+=6;

View File

@ -158,5 +158,18 @@ More self-modifying code, also move SCREEN_X to X register
Total = 193,214
Frame Rate = 5.18 fps
Remove unneeded precision in the 8.8 x 8.8 fixed point multiply
Cycles: flying= 187
Cycles: getkey= 46
Cycles: page_flip= 26
Cycles: multiply= 44,785
Cycles: mode7= 118,034
Cycles: lookup_map= 22,747
Cycles: put_sprite= 2,561
================================
Total = 188,386
Frame Rate = 5.31 fps
Each cycle removed from inner X loop saves
32*40=1280 cycles

View File

@ -1,4 +1,11 @@
; Fast multiply
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; of the 32 bit result. So we disable generation of the high and low byte
; to save some cycles.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply
; This routine, at an expense of 2kB of lookup tables, takes around 250
@ -130,14 +137,14 @@ fixed_16x16_mul_unsigned:
sta sm1b+1 ; 3
sta sm3b+1 ; 3
sta sm5b+1 ; 3
sta sm7b+1 ; 3
; sta sm7b+1 ;
eor #$ff ; invert the bits for subtraction ; 2
sta sm2b+1 ; 3
sta sm4b+1 ; 3
sta sm6b+1 ; 3
sta sm8b+1 ; 3
; sta sm8b+1 ;
;===========
; 58
; 52
num1_same_as_last_time:
@ -154,7 +161,7 @@ sm2a:
; a is _aa
sta RESULT+0 ; 3
; sta RESULT+0 ;
sm3a:
lda square1_hi,x ; 4
@ -163,7 +170,7 @@ sm4a:
; a is _AA
sta _AA+1 ; 3
;===========
; 27
; 24
; Perform NUM1H * NUM2L = CCcc
sec ; 2
@ -213,14 +220,14 @@ sm6b:
sbc square2_lo,x ; 4
; a is _dd
sta _dd+1 ; 3
sm7b:
lda square1_hi,x ; 4
sm8b:
sbc square2_hi,x ; 4
;sm7b:
; lda square1_hi,x ;
;sm8b:
; sbc square2_hi,x ;
; a = _DD
sta RESULT+3 ; 3
; sta RESULT+3 ;
;===========
; 24
; 13
;===========================================
; Add the separate multiplications together
@ -232,7 +239,8 @@ _AA:
_bb:
adc #0 ; adding in _bb ; 2
sta RESULT+1 ; 3
;==========
; 9
; product[2]=_BB+_CC+c
_BB:
@ -241,15 +249,15 @@ _CC:
adc #0 ; adding in _CC ; 2
sta RESULT+2 ; 3
;===========
; 19
; 7
; product[3]=_DD+c
bcc dd_no_carry1 ; ^2nt/3
inc RESULT+3 ; 5
; bcc dd_no_carry1 ;
; inc RESULT+3 ;
clc ; 2
;=============
; 6
; 2
dd_no_carry1:
; product[1]=_AA+_bb+_cc
@ -267,15 +275,15 @@ _dd:
sta RESULT+2 ; 3
;===========
; 19
; 16
; product[3]=_DD+c
bcc dd_no_carry2 ; ^2nt/3
inc RESULT+3 ; 5
; bcc dd_no_carry2 ;
; inc RESULT+3 ;
;=============
; 4
; 0
dd_no_carry2: