From 382fe07bfdf1a57c2785beb4b333e0c0b955509f Mon Sep 17 00:00:00 2001
From: Vince Weaver <vince@deater.net>
Date: Sat, 25 Nov 2017 19:37:02 -0500
Subject: [PATCH] tfv: plug in the high speed multiply

---
 tfv/TODO                |  14 +-
 tfv/tfv_flying.s        |   3 +
 tfv/tfv_multiply.s      | 421 ++++++++++++++++++++++++++++++----------
 tfv/tfv_multiply_slow.s | 129 ++++++++++++
 4 files changed, 460 insertions(+), 107 deletions(-)
 create mode 100644 tfv/tfv_multiply_slow.s

diff --git a/tfv/TODO b/tfv/TODO
index e544242e..ad96ffbb 100644
--- a/tfv/TODO
+++ b/tfv/TODO
@@ -1,11 +1,23 @@
 mode7 speed fixes:
 	+ Don't draw sky every frame, only if needed
+
 	+ faster multiply routine
-	+ move multiply vars to zero page
+	+ If result is AABBCCDD we only need BBCC for fixed point
+		result
+
+	+ re-arrange variables to better take advantage of self-modifying code
+	+ only doing the spacez calculation if it has changed
+	+ update the constants to be constants
+	+ move the screen width constant to own set of varaibles
+		(instead of temp)
 	+ leave one of multiply results in accumulator at end?
 
 	+  Skip key parsing if no key read
 
+	slower: move to 40x40 again
+		draw every other line.  First with color/black		
+		then xor in the second line
+
 short term:
 
 longer term:
diff --git a/tfv/tfv_flying.s b/tfv/tfv_flying.s
index 5ae1096a..c206bb2d 100644
--- a/tfv/tfv_flying.s
+++ b/tfv/tfv_flying.s
@@ -19,6 +19,9 @@ flying_start:
 	jsr	clear_screens
 	jsr     set_gr_page0
 
+
+	jsr	init_multiply_tables
+
 	;===============
 	; Init Variables
 	;===============
diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s
index c1801ab3..ae3a6fb6 100644
--- a/tfv/tfv_multiply.s
+++ b/tfv/tfv_multiply.s
@@ -1,129 +1,338 @@
-; http://www.llx.com/~nparker/a2/mult.html
-; MULTIPLY NUM1H:NUM1L * NUM2H:NUM2L
-; NUM2 is zeroed out
-; result is in RESULT3:RESULT2:RESULT1:RESULT0
+; Fast mutiply
+;
+; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
+; This routine, at an expense of 2kB of looku tables, takes around 250
+;	If you reuse a term the next time this drops closer to 200
 
-;NUM1L:	.byte 0
-;NUM1H:	.byte 0
-;NUM2L:	.byte 0
-;NUM2H:	.byte 0
-;RESULT:	.byte 0,0,0,0
-;NEGATE:	.byte 0
-
-; If we have 2k to spare we should check out
+; This routine was described by Stephen Judd and found
+;	in The Fridge and in the C=Hacking magazine
 ; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
 
-multiply:
+; The key thing to note is that
+;	       (a+b)^2     (a-b)^2
+;       a*b =  -------  -  --------
+;                 4           4
+; So if you have tables of the squares of 0..511 you can lookup and subtract
+; instead of multiplying.
 
-	lda	#$0							; 2
-	sta	NEGATE							; 3
+; Table generation: I:0..511
+;                   square1_lo = <((I*I)/4)
+;                   square1_hi = >((I*I)/4)
+;                   square2_lo = <(((I-255)*(I-255))/4)
+;                   square2_hi = >(((I-255)*(I-255))/4)
 
-	; Handle Signed
-	lda	NUM1H							; 3
-	bpl	check_num2						; 2nt/3
-								;==============
-								;	 10
+; Note: DOS3.3 starts at $9600
 
-	inc	NEGATE							; 3
+square1_lo	EQU	$8E00
+square1_hi	EQU	$9000
+square2_lo	EQU	$9200
+square2_hi	EQU	$9400
 
-	clc		; 2s-complement NUM1H/NUM1L			; 2
-	lda	NUM1L							; 3
-	eor	#$ff							; 2
-	adc	#$1							; 2
-	sta	NUM1L							; 3
+;	for(i=0;i<512;i++) {
+;		square1_lo[i]=((i*i)/4)&0xff;
+;		square1_hi[i]=(((i*i)/4)>>8)&0xff;
+;		square2_lo[i]=( ((i-255)*(i-255))/4)&0xff;
+;		square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
+;	}
 
-	lda	NUM1H							; 3
-	eor	#$ff							; 2
-	adc	#$0							; 2
-	sta	NUM1H							; 3
+init_multiply_tables:
+
+	; Build the add tables
+
+	ldx	#$00
+	txa
+	.byte $c9   ; CMP #immediate - skip TYA and clear carry flag
+lb1:	tya
+	adc	#$00			; 0
+ml1:	sta	square1_hi,x		; square1_hi[0]=0
+	tay				; y=0
+	cmp	#$40			; subtract 64 and update flags (c=0)
+	txa				; a=0
+	ror				; rotate
+ml9:	adc	#$00			; add 0
+	sta	ml9+1			; update add value
+	inx				; x=1
+ml0:	sta	square1_lo,x		; square1_lo[0]=1
+	bne	lb1			; if not zero, loop
+	inc	ml0+2			; increment values
+	inc	ml1+2			; increment values
+	clc				; c=0
+	iny				; y=1
+	bne	lb1			; loop
+
+	; Build the subtract tables based on the existing one
+
+	ldx	#$00
+	ldy	#$ff
+second_table:
+	lda	square1_hi+1,x
+	sta	square2_hi+$100,x
+	lda	square1_hi,x
+	sta	square2_hi,y
+	lda	square1_lo+1,x
+	sta	square2_lo+$100,x
+	lda	square1_lo,x
+	sta	square2_lo,y
+	dey
+	inx
+	bne second_table
+
+
+	rts
+
+
+; Fast 16x16 bit unsigned multiplication, 32-bit result
+; Input: NUM1H:NUM1L * NUM2H:NUM2L
+; Result: RESULT3:RESULT2:RESULT1:RESULT0
+;
+; Does self-modifying code to hard-code NUM1H:NUM1L into the code
+;  carry=0: re-use previous NUM1H:NUM1L
+;  carry=1: reload NUM1H:NUM1L (58 cycles slower)
+;
+; clobbered: RESULT, X, A, C
+; Allocation setup: T1,T2 and RESULT preferably on Zero-page.
+;
+; NUM1H (x_i), NUM1L (x_f)
+; NUM2H (y_i), NUM2L (y_f)
+
+;	NUM1L * NUM2L = AAaa
+;	NUM1L * NUM2H = BBbb
+;	NUM1H * NUM2L = CCcc
+;	NUM1H * NUM2H = DDdd
+;
+;	       AAaa
+;	     BBbb
+;	     CCcc
+;	 + DDdd
+;	 ----------
+;	   RESULT
+
+fixed_16x16_mul_unsigned:
+
+	sec	; FIXME-remove when we implement this
+
+	bcc	num1_same_as_last_time					; 2nt/3
+
+	;============================
+	; Set up self-modifying code
+	; this changes the code to be hard-coded to multiply by NUM1H:NUM1L
+	;============================
+
+	lda	NUM1L	; load the low byte				; 3
+	sta	sm1a+1							; 3
+	sta	sm3a+1							; 3
+	sta	sm5a+1							; 3
+	sta	sm7a+1							; 3
+	eor	#$ff	; invert the bits for subtracting		; 2
+	sta	sm2a+1							; 3
+	sta	sm4a+1							; 3
+	sta	sm6a+1							; 3
+	sta	sm8a+1							; 3
+	lda	NUM1H	; load the high byte				; 3
+	sta	sm1b+1							; 3
+	sta	sm3b+1							; 3
+	sta	sm5b+1							; 3
+	sta	sm7b+1							; 3
+	eor	#$ff	; invert the bits for subtractin		; 2
+	sta	sm2b+1							; 3
+	sta	sm4b+1							; 3
+	sta	sm6b+1							; 3
+	sta	sm8b+1							; 3
 								;===========
-								;	 25
-check_num2:
-	lda	NUM2H							; 3
-	bpl	unsigned_multiply					; 2nt/3
-								;==============
-								;	  6
+								;	 58
 
-	inc	NEGATE							; 3
+num1_same_as_last_time:
 
-	clc								; 2
-	lda	NUM2L							; 3
-	eor	#$ff							; 2
-	adc	#$1							; 2
-	sta	NUM2L							; 3
+	;==========================
+	; Perform NUM1L * NUM2L = AAaa
+	;==========================
 
-	lda	NUM2H							; 3
-	eor	#$ff							; 2
-	adc	#$0							; 2
-	sta	NUM2H							; 3
-								;=============
-								;	 25
-unsigned_multiply:
+	ldx	NUM2L	; (low le)					; 3
+	sec								; 2
+sm1a:
+	lda	square1_lo,x						; 4
+sm2a:
+	sbc	square2_lo,x						; 4
 
-	lda	#0		; Initialize RESULT to 0		; 2
-	sta 	RESULT+2						; 3
-	ldx	#16		; 16x16 multiply			; 2
-								;============
-								;	  7
-multiply_mainloop:
-	lsr	NUM2H		; Shift right 16-bit NUM2		; 5
-	ror	NUM2L		; low bit goes into carry		; 5
-	bcc	shift_output	; 0 or 1?				; 2nt/3
-								;============
-								;	 13
+	; a is _aa
 
-	tay			; If 1, add NUM1 (hi byte RESULT in A)	; 2
-	clc								; 2
-	lda	NUM1L							; 3
-	adc	RESULT+2						; 3
-	sta	RESULT+2						; 3
-	tya								; 2
-	adc	NUM1H							; 3
-								;============
-								;	 18
-shift_output:
-	ror	A		; "Stairstep" shift			; 2
-	ror	RESULT+2						; 5
-	ror	RESULT+1						; 5
-	ror	RESULT							; 5
-	dex								; 2
-	bne	multiply_mainloop					; 2nt/3
-								;=============
-								;	 22
-
-	sta	RESULT+3						; 3
-
-	;; Negate if necessary
-
-	lda	NEGATE							; 3
-	and	#$1							; 2
-	beq	positive						; 2nt/3
-								;==============
-								;	 11
-
-	clc								; 2
-	lda	RESULT+0						; 3
-	eor	#$ff							; 2
-	adc	#$1							; 2
 	sta	RESULT+0						; 3
 
-	lda	RESULT+1						; 3
-	eor	#$ff							; 2
-	adc	#$0							; 2
-	sta	RESULT+1						; 3
+sm3a:
+	lda	square1_hi,x						; 4
+sm4a:
+	sbc	square2_hi,x						; 4
+	; a is _AA
+	sta	_AA+1							; 3
+								;===========
+								;	27
 
-	lda	RESULT+2						; 3
-	eor	#$ff							; 2
-	adc	#$0							; 2
-	sta	RESULT+2						; 3
+	; Perform NUM1H * NUM2L = CCcc
+	sec								; 2
+sm1b:
+	lda	square1_lo,x						; 4
+sm2b:
+	sbc	square2_lo,x						; 4
+	; a is _cc
+	sta	_cc+1							; 3
+sm3b:
+	lda	square1_hi,x						; 4
+sm4b:
+	sbc square2_hi,x						; 4
+	; a is _CC
+	sta	_CC+1							; 3
+								;===========
+								;	 24
 
-	lda	RESULT+3						; 3
-	eor	#$ff							; 2
-	adc	#$0							; 2
+	;==========================
+	; Perform NUM1L * NUM2H = BBbb
+	;==========================
+	ldx	NUM2H							; 3
+	sec								; 2
+sm5a:
+	lda	square1_lo,x						; 4
+sm6a:
+	sbc	square2_lo,x						; 4
+	; a is _bb
+	sta	_bb+1							; 3
+
+sm7a:
+	lda	square1_hi,x						; 4
+sm8a:
+	sbc	square2_hi,x						; 4
+	; a is _BB
+	sta	_BB+1							; 3
+								;===========
+								;	 27
+
+	;==========================
+	; Perform NUM1H * NUM2H = DDdd
+	;==========================
+	sec								; 2
+sm5b:
+	lda	square1_lo,x						; 4
+sm6b:
+	sbc	square2_lo,x						; 4
+	; a is _dd
+	sta	_dd+1							; 3
+sm7b:
+	lda	square1_hi,x						; 4
+sm8b:
+	sbc	square2_hi,x						; 4
+	; a = _DD
 	sta	RESULT+3						; 3
 								;===========
-								;	 42
-positive:
+								; 	 24
+
+	;===========================================
+	; Add the separate multiplications together
+	;===========================================
+
+	clc								; 2
+_AA:
+	lda	#0		; loading _AA				; 2
+_bb:
+	adc	#0		; adding in _bb				; 2
+	sta	RESULT+1						; 3
+
+	; product[2]=_BB+_CC+c
+
+_BB:
+	lda	#0		; loading _BB				; 2
+_CC:
+	adc	#0		; adding in _CC				; 2
+	sta RESULT+2							; 3
+								;===========
+								;	 19
+
+	;  product[3]=_DD+c
+
+	bcc	dd_no_carry1						; ^2nt/3
+	inc	RESULT+3						; 5
+	clc								; 2
+								;=============
+								;	  6
+dd_no_carry1:
+
+	; product[1]=_AA+_bb+_cc
+
+_cc:
+	lda	#0		; load _cc				; 2
+	adc	RESULT+1						; 3
+	sta	RESULT+1						; 3
+
+	; product[2]=_BB+_CC+_dd+c
+
+_dd:
+	lda	#0		; load _dd				; 2
+	adc	RESULT+2						; 3
+	sta	RESULT+2						; 3
+
+								;===========
+								;	 19
+	; product[3]=_DD+c
+
+
+	bcc	dd_no_carry2						; ^2nt/3
+	inc	RESULT+3						; 5
+
+								;=============
+								;	 4
+
+dd_no_carry2:
+
+;	*z_i=product[1];
+;	*z_f=product[0];
+
+	rts								; 6
+
+
+	;=================
+	; Signed multiply
+	;=================
+
+multiply:
+
+	jsr fixed_16x16_mul_unsigned					; 6
+
+	lda	NUM1H		; x_i					; 3
+								;===========
+								;	 12
+
+
+	bpl	x_positive						;^3/2nt
+
+	sec								; 2
+	lda	RESULT+2						; 3
+	sbc	NUM2L							; 3
+	sta	RESULT+2						; 3
+	lda	RESULT+3						; 3
+	sbc	NUM2H							; 3
+	sta	RESULT+3						; 3
+								;============
+								;	 19
+
+x_positive:
+
+	lda	NUM2H		; y_i					; 3
+								;============
+								;	; 6
+	bpl	y_positive						; 3/2nt
+
+
+	sec								; 2
+	lda	RESULT+2						; 3
+	sbc	NUM1L							; 3
+	sta	RESULT+2						; 3
+	lda	RESULT+3						; 3
+	sbc	NUM1H							; 3
+	sta	RESULT+3						; 3
+								;===========
+								;	 19
+
+y_positive:
+;	*z_i=product[2];
+;	*z_f=product[1];
 
 	rts								; 6
 
diff --git a/tfv/tfv_multiply_slow.s b/tfv/tfv_multiply_slow.s
new file mode 100644
index 00000000..c1801ab3
--- /dev/null
+++ b/tfv/tfv_multiply_slow.s
@@ -0,0 +1,129 @@
+; http://www.llx.com/~nparker/a2/mult.html
+; MULTIPLY NUM1H:NUM1L * NUM2H:NUM2L
+; NUM2 is zeroed out
+; result is in RESULT3:RESULT2:RESULT1:RESULT0
+
+;NUM1L:	.byte 0
+;NUM1H:	.byte 0
+;NUM2L:	.byte 0
+;NUM2H:	.byte 0
+;RESULT:	.byte 0,0,0,0
+;NEGATE:	.byte 0
+
+; If we have 2k to spare we should check out
+; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
+
+multiply:
+
+	lda	#$0							; 2
+	sta	NEGATE							; 3
+
+	; Handle Signed
+	lda	NUM1H							; 3
+	bpl	check_num2						; 2nt/3
+								;==============
+								;	 10
+
+	inc	NEGATE							; 3
+
+	clc		; 2s-complement NUM1H/NUM1L			; 2
+	lda	NUM1L							; 3
+	eor	#$ff							; 2
+	adc	#$1							; 2
+	sta	NUM1L							; 3
+
+	lda	NUM1H							; 3
+	eor	#$ff							; 2
+	adc	#$0							; 2
+	sta	NUM1H							; 3
+								;===========
+								;	 25
+check_num2:
+	lda	NUM2H							; 3
+	bpl	unsigned_multiply					; 2nt/3
+								;==============
+								;	  6
+
+	inc	NEGATE							; 3
+
+	clc								; 2
+	lda	NUM2L							; 3
+	eor	#$ff							; 2
+	adc	#$1							; 2
+	sta	NUM2L							; 3
+
+	lda	NUM2H							; 3
+	eor	#$ff							; 2
+	adc	#$0							; 2
+	sta	NUM2H							; 3
+								;=============
+								;	 25
+unsigned_multiply:
+
+	lda	#0		; Initialize RESULT to 0		; 2
+	sta 	RESULT+2						; 3
+	ldx	#16		; 16x16 multiply			; 2
+								;============
+								;	  7
+multiply_mainloop:
+	lsr	NUM2H		; Shift right 16-bit NUM2		; 5
+	ror	NUM2L		; low bit goes into carry		; 5
+	bcc	shift_output	; 0 or 1?				; 2nt/3
+								;============
+								;	 13
+
+	tay			; If 1, add NUM1 (hi byte RESULT in A)	; 2
+	clc								; 2
+	lda	NUM1L							; 3
+	adc	RESULT+2						; 3
+	sta	RESULT+2						; 3
+	tya								; 2
+	adc	NUM1H							; 3
+								;============
+								;	 18
+shift_output:
+	ror	A		; "Stairstep" shift			; 2
+	ror	RESULT+2						; 5
+	ror	RESULT+1						; 5
+	ror	RESULT							; 5
+	dex								; 2
+	bne	multiply_mainloop					; 2nt/3
+								;=============
+								;	 22
+
+	sta	RESULT+3						; 3
+
+	;; Negate if necessary
+
+	lda	NEGATE							; 3
+	and	#$1							; 2
+	beq	positive						; 2nt/3
+								;==============
+								;	 11
+
+	clc								; 2
+	lda	RESULT+0						; 3
+	eor	#$ff							; 2
+	adc	#$1							; 2
+	sta	RESULT+0						; 3
+
+	lda	RESULT+1						; 3
+	eor	#$ff							; 2
+	adc	#$0							; 2
+	sta	RESULT+1						; 3
+
+	lda	RESULT+2						; 3
+	eor	#$ff							; 2
+	adc	#$0							; 2
+	sta	RESULT+2						; 3
+
+	lda	RESULT+3						; 3
+	eor	#$ff							; 2
+	adc	#$0							; 2
+	sta	RESULT+3						; 3
+								;===========
+								;	 42
+positive:
+
+	rts								; 6
+