diff --git a/gr-sim/tfv_flying_6502.c b/gr-sim/tfv_flying_6502.c index ba597d47..9d9650ff 100644 --- a/gr-sim/tfv_flying_6502.c +++ b/gr-sim/tfv_flying_6502.c @@ -291,8 +291,8 @@ static unsigned char square2_lo[512]; static unsigned char square2_hi[512]; static int sm1a,sm3a,sm5a,sm7a; static int sm2a,sm4a,sm6a,sm8a; -static int sm1b,sm3b,sm5b,sm7b; -static int sm2b,sm4b,sm6b,sm8b; +static int sm1b,sm3b,sm5b; //,sm7b; +static int sm2b,sm4b,sm6b; //,sm8b; static int table_ready=0; @@ -355,7 +355,8 @@ static int fixed_mul_unsigned( int c=0; int a,x; - int _AA,_BB,_CC,_DD,_aa,_bb,_cc,_dd; + int _AA,_BB,_CC; //,_DD; + int _aa,_bb,_cc,_dd; if (!table_ready) init_table(); @@ -377,57 +378,53 @@ static int fixed_mul_unsigned( sm1b=a; // sta sm1b+1 ; 3 sm3b=a; // sta sm3b+1 ; 3 sm5b=a; // sta sm5b+1 ; 3 - sm7b=a; // sta sm7b+1 ; 3 + // sm7b=a; // sta sm7b+1 ; a=(~a)&0xff; // eor #$ff ; 2 sm2b=a; // sta sm2b+1 ; 3 sm4b=a; // sta sm4b+1 ; 3 sm6b=a; // sta sm6b+1 ; 3 - sm8b=a; // sta sm8b+1 ; 3 - cycles.multiply+=58; + // sm8b=a; // sta sm8b+1 ; + cycles.multiply+=52; } /* Perform T1_hi * T2 = BBbb */ x=(y_i)&0xff; // ldx T2+1 ; 3 c=1; // sec ; 2 -//sm5a: + //sm5a: a=square1_lo[sm5a+x]; // lda square1_lo,x ; 4 -//sm6a: + //sm6a: a+=(~(square2_lo[sm6a+x]))+c; // sbc square2_lo,x ; 4 c=!(a&0x100); a&=0xff; _bb=a; // sta _bb+1 ; 3 -// printf("\t\t\t\tbb=%x c=%d\n",_bb,c); -//sm7a: + //sm7a: a=square1_hi[sm7a+x]; // lda square1_hi,x ; 4 -//sm8a: + //sm8a: a+=(~(square2_hi[sm8a+x]))+c; // sbc square2_hi,x ; 4 c=!(a&0x100); a&=0xff; @@ -457,23 +453,23 @@ static int fixed_mul_unsigned( /* Perform >T1 * >T2 = DDdd */ c=1; // sec ; 2 -//sm5b: + //sm5b: a=square1_lo[sm5b+x]; // lda square1_lo,x ; 4 -//sm6b: + //sm6b: a+=(~(square2_lo[sm6b+x]))+c; // sbc square2_lo,x ; 4 c=!(a&0x100); a&=0xff; _dd=a; // sta _dd+1 ; 3 -//sm7b: - a=square1_hi[sm7b+x]; // lda square1_hi,x ; 4 -//sm8b: - a+=(~(square2_hi[sm8b+x]))+c; // sbc square2_hi,x ; 4 - c=!(a&0x100); - a&=0xff; + //sm7b: + //a=square1_hi[sm7b+x]; // lda square1_hi,x ; + //sm8b: + //a+=(~(square2_hi[sm8b+x]))+c; // sbc square2_hi,x ; + //c=!(a&0x100); + //a&=0xff; - product[3]=a; // sta PRODUCT+3 ; 3 - _DD=a; - cycles.multiply+=24; + //product[3]=a; // sta PRODUCT+3 ; + //_DD=a; + cycles.multiply+=13; /*********************************************/ /* Add the separate multiplications together */ /*********************************************/ @@ -485,9 +481,9 @@ static int fixed_mul_unsigned( if (debug) printf("product[1]=%02x+%02x+0=",_AA,_bb); c=0; // clc ; 2 -//_AA: + //_AA: a=_AA; // lda #0 ; 2 -//_bb: + //_bb: a+=(c+_bb); // adc #0 ; 2 c=!!(a&0x100); a&=0xff; @@ -496,27 +492,27 @@ static int fixed_mul_unsigned( cycles.multiply+=9; // product[2]=_BB+_CC+c if (debug) printf("product[2]=%02x+%02x+%d=",_BB,_CC,c); -//_BB: + //_BB: a=_BB; // lda #0 ; 2 -//_CC: + //_CC: a+=(c+_CC); // adc #0 ; 2 c=!!(a&0x100); a&=0xff; product[2]=a; // sta PRODUCT+2 ; 3 if (debug) printf("%x.%02x\n",c,a); - cycles.multiply+=10; + cycles.multiply+=7; // product[3]=_DD+c - if (debug) printf("product[3]=%02x+%d=",_DD,c); - if (c==0) goto urgh2; // bcc :+ ; 2nt/3 - product[3]++; // inc PRODUCT+3 ; 5 - product[3]&=0xff; +// if (debug) printf("product[3]=%02x+%d=",_DD,c); +// if (c==0) goto urgh2; // bcc :+ ; 2nt/3 +// product[3]++; // inc PRODUCT+3 ; 5 +// product[3]&=0xff; c=0; // clc ; 2 - cycles.multiply+=6; -urgh2: + cycles.multiply+=2; +//urgh2: if (debug) printf("%x.%02x\n",c,product[3]); // product[1]=_AA+_bb+_cc if (debug) printf("product[1]=%02x+%02x+%d=",product[1],_cc,c); -//_cc: + //_cc: a=_cc; // lda #0 ; 2 a+=c+product[1]; // adc PRODUCT+1 ; 3 c=!!(a&0x100); @@ -526,7 +522,7 @@ urgh2: // product[2]=_BB+_CC+_dd+c if (debug) printf("product[2]=%02x+%02x+%d=",product[2],_dd,c); -//_dd: + //_dd: a=_dd; // lda #0 ; 2 a+=c+product[2]; // adc PRODUCT+2 ; 3 c=!!(a&0x100); @@ -535,25 +531,25 @@ urgh2: if (debug) printf("%x.%02x\n",c,product[2]); // product[3]=_DD+c - if (debug) printf("product[3]=%02x+%d=",product[3],c); - cycles.multiply+=19; - if (c==0) goto urgh; // bcc :+ ; 2nt/3 - product[3]++; // inc PRODUCT+3 ; 5 - product[3]&=0xff; - cycles.multiply+=4; -urgh: + //if (debug) printf("product[3]=%02x+%d=",product[3],c); + cycles.multiply+=16; + //if (c==0) goto urgh; // bcc :+ ; 2nt/3 + //product[3]++; // inc PRODUCT+3 ; 5 + //product[3]&=0xff; + cycles.multiply+=0; +//urgh: if (debug) printf("%x.%02x\n",c,product[3]); *z_i=product[1]; *z_f=product[0]; // printf("Result=%02x:%02x\n",*z_i,*z_f); - if (debug) { - printf(" AAaa %02x:%02x\n",_AA,_aa); - printf(" BBbb %02x:%02x\n",_BB,_bb); - printf(" CCcc %02x:%02x\n",_CC,_cc); - printf("DDdd %02x:%02x\n",_DD,_dd); - } +// if (debug) { +// printf(" AAaa %02x:%02x\n",_AA,_aa); +// printf(" BBbb %02x:%02x\n",_BB,_bb); +// printf(" CCcc %02x:%02x\n",_CC,_cc); +// printf("DDdd %02x:%02x\n",_DD,_dd); +// } cycles.multiply+=6; diff --git a/tfv/OPTIMIZATION b/tfv/OPTIMIZATION index 76f1a07d..6e6eeff0 100644 --- a/tfv/OPTIMIZATION +++ b/tfv/OPTIMIZATION @@ -158,5 +158,18 @@ More self-modifying code, also move SCREEN_X to X register Total = 193,214 Frame Rate = 5.18 fps +Remove unneeded precision in the 8.8 x 8.8 fixed point multiply + Cycles: flying= 187 + Cycles: getkey= 46 + Cycles: page_flip= 26 + Cycles: multiply= 44,785 + Cycles: mode7= 118,034 + Cycles: lookup_map= 22,747 + Cycles: put_sprite= 2,561 + ================================ + Total = 188,386 + Frame Rate = 5.31 fps + + Each cycle removed from inner X loop saves 32*40=1280 cycles diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s index b8f57fea..c969474d 100644 --- a/tfv/tfv_multiply.s +++ b/tfv/tfv_multiply.s @@ -1,4 +1,11 @@ ; Fast mutiply + + +; Note for our purposes we only care about 8.8 x 8.8 fixed point +; with 8.8 result, which means we only care about the middle two bytes +; of the 32 bit result. So we disable generation of the high and low byte +; to save some cycles. + ; ; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply ; This routine, at an expense of 2kB of looku tables, takes around 250 @@ -130,14 +137,14 @@ fixed_16x16_mul_unsigned: sta sm1b+1 ; 3 sta sm3b+1 ; 3 sta sm5b+1 ; 3 - sta sm7b+1 ; 3 +; sta sm7b+1 ; eor #$ff ; invert the bits for subtractin ; 2 sta sm2b+1 ; 3 sta sm4b+1 ; 3 sta sm6b+1 ; 3 - sta sm8b+1 ; 3 +; sta sm8b+1 ; ;=========== - ; 58 + ; 52 num1_same_as_last_time: @@ -154,7 +161,7 @@ sm2a: ; a is _aa - sta RESULT+0 ; 3 +; sta RESULT+0 ; sm3a: lda square1_hi,x ; 4 @@ -163,7 +170,7 @@ sm4a: ; a is _AA sta _AA+1 ; 3 ;=========== - ; 27 + ; 24 ; Perform NUM1H * NUM2L = CCcc sec ; 2 @@ -213,14 +220,14 @@ sm6b: sbc square2_lo,x ; 4 ; a is _dd sta _dd+1 ; 3 -sm7b: - lda square1_hi,x ; 4 -sm8b: - sbc square2_hi,x ; 4 +;sm7b: +; lda square1_hi,x ; +;sm8b: +; sbc square2_hi,x ; ; a = _DD - sta RESULT+3 ; 3 +; sta RESULT+3 ; ;=========== - ; 24 + ; 13 ;=========================================== ; Add the separate multiplications together @@ -232,7 +239,8 @@ _AA: _bb: adc #0 ; adding in _bb ; 2 sta RESULT+1 ; 3 - + ;========== + ; 9 ; product[2]=_BB+_CC+c _BB: @@ -241,15 +249,15 @@ _CC: adc #0 ; adding in _CC ; 2 sta RESULT+2 ; 3 ;=========== - ; 19 + ; 7 ; product[3]=_DD+c - bcc dd_no_carry1 ; ^2nt/3 - inc RESULT+3 ; 5 +; bcc dd_no_carry1 ; +; inc RESULT+3 ; clc ; 2 ;============= - ; 6 + ; 2 dd_no_carry1: ; product[1]=_AA+_bb+_cc @@ -267,15 +275,15 @@ _dd: sta RESULT+2 ; 3 ;=========== - ; 19 + ; 16 ; product[3]=_DD+c - bcc dd_no_carry2 ; ^2nt/3 - inc RESULT+3 ; 5 +; bcc dd_no_carry2 ; +; inc RESULT+3 ; ;============= - ; 4 + ; 0 dd_no_carry2: