tfv: optimize the multiply routine down some more

2025-01-11 09:29:51 +00:00 · 2017-11-29 00:38:06 -05:00 · 2017-11-29 00:38:06 -05:00 · a7f4ec0bc0
commit a7f4ec0bc0
parent 68cac10f1e
3 changed files with 101 additions and 84 deletions
--- a/gr-sim/tfv_flying_6502.c
+++ b/gr-sim/tfv_flying_6502.c
@ -291,8 +291,8 @@ static unsigned char square2_lo[512];
 static unsigned char square2_hi[512];
 static int sm1a,sm3a,sm5a,sm7a;
 static int sm2a,sm4a,sm6a,sm8a;
-static int sm1b,sm3b,sm5b,sm7b;
-static int sm2b,sm4b,sm6b,sm8b;
+static int sm1b,sm3b,sm5b; //,sm7b;
+static int sm2b,sm4b,sm6b; //,sm8b;


 static int table_ready=0;
@ -355,7 +355,8 @@ static int fixed_mul_unsigned(
 	int c=0;
 	int a,x;

-	int _AA,_BB,_CC,_DD,_aa,_bb,_cc,_dd;
+	int _AA,_BB,_CC; //,_DD;
+	int _aa,_bb,_cc,_dd;

 	if (!table_ready) init_table();

@ -377,57 +378,53 @@ static int fixed_mul_unsigned(
 		sm1b=a;			// sta sm1b+1		; 3
 		sm3b=a;			// sta sm3b+1		; 3
 		sm5b=a;			// sta sm5b+1		; 3
-		sm7b=a;			// sta sm7b+1		; 3
+	//	sm7b=a;			// sta sm7b+1		;
 		a=(~a)&0xff;		// eor #$ff		; 2
 		sm2b=a;			// sta sm2b+1		; 3
 		sm4b=a;			// sta sm4b+1		; 3
 		sm6b=a;			// sta sm6b+1		; 3
-		sm8b=a;			// sta sm8b+1		; 3
-					cycles.multiply+=58;
+	//	sm8b=a;			// sta sm8b+1		;
+					cycles.multiply+=52;
 	}

 	/* Perform <T1 * <T2 = AAaa */
 	x=(y_f)&0xff;			// ldx T2+0 (low le)		; 3
 	c=1;					// sec			; 2
-//sm1a:
+	//sm1a:
 	a=square1_lo[sm1a+x];			// lda square1_lo,x	; 4
-//sm2a:
+	//sm2a:
 	a+=~(square2_lo[sm2a+x])+c;		// sbc square2_lo,x	; 4
 	c=!(a&0x100);
 	a&=0xff;

-//	printf("\t\t\t\ta=(%d+%d)^2/4=%d "
-//		"b=(%d+%d)^2/4=%d\n",
-//		sm1a,x,square1_lo[sm1a+x],
-//		sm2a,x,square2_lo[sm2a+x]);
-	product[0]=a;				// sta PRODUCT+0	; 3
+//	product[0]=a;				// sta PRODUCT+0	;
 	_aa=a;
-//	printf("\t\t\t\ta-b aa=%2x\n",a);
-//sm3a:
+
+	//sm3a:
 	a=square1_hi[sm3a+x];			// lda square1_hi,x	; 4
-//sm4a:
+	//sm4a:
 	a+=(~(square2_hi[sm4a+x]))+c;		// sbc square2_hi,x	; 4
 	c=!(a&0x100);
 	a&=0xff;
 	_AA=a;					// sta _AA+1		; 3
 						//		;===========
-						//		;	27
+						//		;	24

-					cycles.multiply+=27;
+					cycles.multiply+=24;

 	/* Perform >T1_hi * <T2 = CCcc */
 	c=1;					// sec			; 2
-//sm1b:
+	//sm1b:
 	a=square1_lo[sm1b+x];			// lda square1_lo,x	; 4
-//sm2b:
+	//sm2b:
 	a+=(~(square2_lo[sm2b+x]))+c;		// sbc square2_lo,x	; 4
 	c=!(a&0x100);
 	a&=0xff;

 	_cc=a;					// sta _cc+1		; 3
-//sm3b:
+	//sm3b:
 	a=square1_hi[sm3b+x];			// lda square1_hi,x	; 4
-//sm4b:
+	//sm4b:
 	a+=(~(square2_hi[sm4b+x]))+c;		// sbc square2_hi,x	; 4
 	c=!!(a&0x100);
 	a&=0xff;
@ -438,17 +435,16 @@ static int fixed_mul_unsigned(
 	/* Perform <T1 * >T2 = BBbb */
 	x=(y_i)&0xff;				// ldx T2+1		; 3
 	c=1;					// sec			; 2
-//sm5a:
+	//sm5a:
 	a=square1_lo[sm5a+x];			// lda square1_lo,x	; 4
-//sm6a:
+	//sm6a:
 	a+=(~(square2_lo[sm6a+x]))+c;		// sbc square2_lo,x	; 4
 	c=!(a&0x100);
 	a&=0xff;
 	_bb=a;					// sta _bb+1		; 3
-//	printf("\t\t\t\tbb=%x c=%d\n",_bb,c);
-//sm7a:
+	//sm7a:
 	a=square1_hi[sm7a+x];			// lda square1_hi,x	; 4
-//sm8a:
+	//sm8a:
 	a+=(~(square2_hi[sm8a+x]))+c;		// sbc square2_hi,x	; 4
 	c=!(a&0x100);
 	a&=0xff;
@ -457,23 +453,23 @@ static int fixed_mul_unsigned(

 	/* Perform >T1 * >T2 = DDdd */
 	c=1;					// sec			; 2
-//sm5b:
+	//sm5b:
 	a=square1_lo[sm5b+x];			// lda square1_lo,x	; 4
-//sm6b:
+	//sm6b:
 	a+=(~(square2_lo[sm6b+x]))+c;		// sbc square2_lo,x	; 4
 	c=!(a&0x100);
 	a&=0xff;
 	_dd=a;					// sta _dd+1		; 3
-//sm7b:
-	a=square1_hi[sm7b+x];			// lda square1_hi,x	; 4
-//sm8b:
-	a+=(~(square2_hi[sm8b+x]))+c;		// sbc square2_hi,x	; 4
-	c=!(a&0x100);
-	a&=0xff;
+	//sm7b:
+	//a=square1_hi[sm7b+x];			// lda square1_hi,x	;
+	//sm8b:
+	//a+=(~(square2_hi[sm8b+x]))+c;		// sbc square2_hi,x	;
+	//c=!(a&0x100);
+	//a&=0xff;

-	product[3]=a;				// sta PRODUCT+3	; 3
-	_DD=a;
-					cycles.multiply+=24;
+	//product[3]=a;				// sta PRODUCT+3	;
+	//_DD=a;
+					cycles.multiply+=13;
 	/*********************************************/
 	/* Add the separate multiplications together */
 	/*********************************************/
@ -485,9 +481,9 @@ static int fixed_mul_unsigned(
 	if (debug) printf("product[1]=%02x+%02x+0=",_AA,_bb);

 	c=0;					// clc			; 2
-//_AA:
+	//_AA:
 	a=_AA;					// lda #0		; 2
-//_bb:
+	//_bb:
 	a+=(c+_bb);				// adc #0		; 2
 	c=!!(a&0x100);
 	a&=0xff;
@ -496,27 +492,27 @@ static int fixed_mul_unsigned(
 					cycles.multiply+=9;
 	// product[2]=_BB+_CC+c
 	if (debug) printf("product[2]=%02x+%02x+%d=",_BB,_CC,c);
-//_BB:
+	//_BB:
 	a=_BB;					// lda #0		; 2
-//_CC:
+	//_CC:
 	a+=(c+_CC);				// adc #0		; 2
 	c=!!(a&0x100);
 	a&=0xff;
 	product[2]=a;				// sta PRODUCT+2	; 3
 	if (debug) printf("%x.%02x\n",c,a);
-					cycles.multiply+=10;
+					cycles.multiply+=7;
 	// product[3]=_DD+c
-	if (debug) printf("product[3]=%02x+%d=",_DD,c);
-	if (c==0) goto urgh2;			// bcc :+		; 2nt/3
-	product[3]++;				// inc PRODUCT+3	; 5
-	product[3]&=0xff;
+//	if (debug) printf("product[3]=%02x+%d=",_DD,c);
+//	if (c==0) goto urgh2;			// bcc :+		; 2nt/3
+//	product[3]++;				// inc PRODUCT+3	; 5
+//	product[3]&=0xff;
 	c=0;					// clc			; 2
-					cycles.multiply+=6;
-urgh2:
+					cycles.multiply+=2;
+//urgh2:
 	if (debug) printf("%x.%02x\n",c,product[3]);
 	// product[1]=_AA+_bb+_cc
 	if (debug) printf("product[1]=%02x+%02x+%d=",product[1],_cc,c);
-//_cc:
+	//_cc:
 	a=_cc;					// lda #0		; 2
 	a+=c+product[1];			// adc PRODUCT+1	; 3
 	c=!!(a&0x100);
@ -526,7 +522,7 @@ urgh2:

 	// product[2]=_BB+_CC+_dd+c
 	if (debug) printf("product[2]=%02x+%02x+%d=",product[2],_dd,c);
-//_dd:
+	//_dd:
 	a=_dd;					// lda #0		; 2
 	a+=c+product[2];			// adc PRODUCT+2	; 3
 	c=!!(a&0x100);
@ -535,25 +531,25 @@ urgh2:
 	if (debug) printf("%x.%02x\n",c,product[2]);

 	// product[3]=_DD+c
-	if (debug) printf("product[3]=%02x+%d=",product[3],c);
-					cycles.multiply+=19;
-	if (c==0) goto urgh;			// bcc :+		; 2nt/3
-	product[3]++;				// inc PRODUCT+3	; 5
-	product[3]&=0xff;
-					cycles.multiply+=4;
-urgh:
+	//if (debug) printf("product[3]=%02x+%d=",product[3],c);
+					cycles.multiply+=16;
+	//if (c==0) goto urgh;			// bcc :+		; 2nt/3
+	//product[3]++;				// inc PRODUCT+3	; 5
+	//product[3]&=0xff;
+					cycles.multiply+=0;
+//urgh:
 	if (debug) printf("%x.%02x\n",c,product[3]);
 	*z_i=product[1];
 	*z_f=product[0];

 //	printf("Result=%02x:%02x\n",*z_i,*z_f);

-	if (debug) {
-		printf("    AAaa        %02x:%02x\n",_AA,_aa);
-		printf("  BBbb       %02x:%02x\n",_BB,_bb);
-		printf("  CCcc       %02x:%02x\n",_CC,_cc);
-		printf("DDdd      %02x:%02x\n",_DD,_dd);
-	}
+//	if (debug) {
+//		printf("    AAaa        %02x:%02x\n",_AA,_aa);
+//		printf("  BBbb       %02x:%02x\n",_BB,_bb);
+//		printf("  CCcc       %02x:%02x\n",_CC,_cc);
+//		printf("DDdd      %02x:%02x\n",_DD,_dd);
+//	}

 					cycles.multiply+=6;

--- a/tfv/OPTIMIZATION
+++ b/tfv/OPTIMIZATION
@ -158,5 +158,18 @@ More self-modifying code, also move SCREEN_X to X register
 	Total =                 193,214
 	Frame Rate = 5.18 fps

+Remove unneeded precision in the 8.8 x 8.8 fixed point multiply
+	Cycles: flying=             187
+	Cycles: getkey=              46
+	Cycles: page_flip=           26
+	Cycles: multiply=        44,785
+	Cycles: mode7=          118,034
+	Cycles: lookup_map=      22,747
+	Cycles: put_sprite=       2,561
+	================================
+	Total =                 188,386
+	Frame Rate = 5.31 fps
+
+
 Each cycle removed from inner X loop saves
 	32*40=1280 cycles
--- a/tfv/tfv_multiply.s
+++ b/tfv/tfv_multiply.s
@ -1,4 +1,11 @@
 ; Fast mutiply
+
+
+; Note: for our purposes we only care about 8.8 x 8.8 fixed point
+; with an 8.8 result, which means we only need the middle two bytes
+; of the 32-bit result.  So we disable generation of the high and low
+; bytes to save some cycles.
+
 ;
 ; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
 ; This routine, at an expense of 2kB of looku tables, takes around 250
@ -130,14 +137,14 @@ fixed_16x16_mul_unsigned:
 	sta	sm1b+1							; 3
 	sta	sm3b+1							; 3
 	sta	sm5b+1							; 3
-	sta	sm7b+1							; 3
+;	sta	sm7b+1							;
 	eor	#$ff	; invert the bits for subtractin		; 2
 	sta	sm2b+1							; 3
 	sta	sm4b+1							; 3
 	sta	sm6b+1							; 3
-	sta	sm8b+1							; 3
+;	sta	sm8b+1							;
 								;===========
-								;	 58
+								;	 52

 num1_same_as_last_time:

@ -154,7 +161,7 @@ sm2a:

 	; a is _aa

-	sta	RESULT+0						; 3
+;	sta	RESULT+0						;

 sm3a:
 	lda	square1_hi,x						; 4
@ -163,7 +170,7 @@ sm4a:
 	; a is _AA
 	sta	_AA+1							; 3
 								;===========
-								;	27
+								;	24

 	; Perform NUM1H * NUM2L = CCcc
 	sec								; 2
@ -213,14 +220,14 @@ sm6b:
 	sbc	square2_lo,x						; 4
 	; a is _dd
 	sta	_dd+1							; 3
-sm7b:
-	lda	square1_hi,x						; 4
-sm8b:
-	sbc	square2_hi,x						; 4
+;sm7b:
+;	lda	square1_hi,x						;
+;sm8b:
+;	sbc	square2_hi,x						;
 	; a = _DD
-	sta	RESULT+3						; 3
+;	sta	RESULT+3						;
 								;===========
-								; 	 24
+								; 	 13

 	;===========================================
 	; Add the separate multiplications together
@ -232,7 +239,8 @@ _AA:
 _bb:
 	adc	#0		; adding in _bb				; 2
 	sta	RESULT+1						; 3
-
+								;==========
+								;	  9
 	; product[2]=_BB+_CC+c

 _BB:
@ -241,15 +249,15 @@ _CC:
 	adc	#0		; adding in _CC				; 2
 	sta RESULT+2							; 3
 								;===========
-								;	 19
+								;	  7

 	;  product[3]=_DD+c

-	bcc	dd_no_carry1						; ^2nt/3
-	inc	RESULT+3						; 5
+;	bcc	dd_no_carry1						;
+;	inc	RESULT+3						;
 	clc								; 2
 								;=============
-								;	  6
+								;	  2
 dd_no_carry1:

 	; product[1]=_AA+_bb+_cc
@ -267,15 +275,15 @@ _dd:
 	sta	RESULT+2						; 3

 								;===========
-								;	 19
+								;	 16
 	; product[3]=_DD+c


-	bcc	dd_no_carry2						; ^2nt/3
-	inc	RESULT+3						; 5
+;	bcc	dd_no_carry2						;
+;	inc	RESULT+3						;

 								;=============
-								;	 4
+								;	 0

 dd_no_carry2: