From d5f906218aa9015d9fcf4b34edb8aa54b065daba Mon Sep 17 00:00:00 2001
From: Vince Weaver <vince@deater.net>
Date: Sat, 16 Dec 2017 14:17:33 -0500
Subject: [PATCH] tfv: move fast multiply to common area

---
 .../multiply_fast.s                           |   0
 .../multiply_slow.s                           |   0
 mode7/Makefile                                |  11 +-
 mode7/mode7.s                                 |   2 +-
 tfv/Makefile                                  |   3 +-
 tfv/tfv_flying.s                              |   2 +-
 tfv/tfv_multiply.s                            | 349 ------------------
 7 files changed, 10 insertions(+), 357 deletions(-)
 rename mode7/fast_multiply.s => asm_routines/multiply_fast.s (100%)
 rename tfv/tfv_multiply_slow.s => asm_routines/multiply_slow.s (100%)
 delete mode 100644 tfv/tfv_multiply.s

diff --git a/mode7/fast_multiply.s b/asm_routines/multiply_fast.s
similarity index 100%
rename from mode7/fast_multiply.s
rename to asm_routines/multiply_fast.s
diff --git a/tfv/tfv_multiply_slow.s b/asm_routines/multiply_slow.s
similarity index 100%
rename from tfv/tfv_multiply_slow.s
rename to asm_routines/multiply_slow.s
diff --git a/mode7/Makefile b/mode7/Makefile
index 7a2a0930..c7fb501b 100644
--- a/mode7/Makefile
+++ b/mode7/Makefile
@@ -23,7 +23,8 @@ MODE7_ISLAND:	mode7_island.o
 	ld65 -o MODE7_ISLAND mode7_island.o -C ./apple2_1000.inc
 
 mode7_island.o:	mode7.s island_lookup.s island_map.inc \
-	fast_multiply.s zp.inc sprites.inc \
+	zp.inc sprites.inc \
+	../asm_routines/multiply_fast.s \
 	../asm_routines/hlin_clearscreen.s \
 	../asm_routines/pageflip.s \
 	../asm_routines/gr_setpage.s \
@@ -36,8 +37,8 @@ MODE7_CHECKERBOARD:	mode7_checkerboard.o
 	ld65 -o MODE7_CHECKERBOARD mode7_checkerboard.o -C ./apple2_1000.inc
 
 mode7_checkerboard.o:	mode7.s checkerboard_lookup.s \
-	fast_multiply.s zp.inc sprites.inc \
-	fast_multiply.s zp.inc sprites.inc \
+	zp.inc sprites.inc \
+	../asm_routines/multiply_fast.s \
 	../asm_routines/hlin_clearscreen.s \
 	../asm_routines/pageflip.s \
 	../asm_routines/gr_setpage.s \
@@ -50,8 +51,8 @@ MODE7_RAINBOW:	mode7_rainbow.o
 	ld65 -o MODE7_RAINBOW mode7_rainbow.o -C ./apple2_1000.inc
 
 mode7_rainbow.o:	mode7.s rainbow_lookup.s \
-	fast_multiply.s zp.inc sprites.inc \
-	fast_multiply.s zp.inc sprites.inc \
+	zp.inc sprites.inc \
+	../asm_routines/multiply_fast.s \
 	../asm_routines/hlin_clearscreen.s \
 	../asm_routines/pageflip.s \
 	../asm_routines/gr_setpage.s \
diff --git a/mode7/mode7.s b/mode7/mode7.s
index bb3a4c1a..a12026cf 100644
--- a/mode7/mode7.s
+++ b/mode7/mode7.s
@@ -1113,7 +1113,7 @@ exit:
 .include "island_map.inc"
 .endif
 
-.include "fast_multiply.s"
+.include "../asm_routines/multiply_fast.s"
 
 
 ; 8.8 fixed point
diff --git a/tfv/Makefile b/tfv/Makefile
index 7a226c2f..be5578ea 100644
--- a/tfv/Makefile
+++ b/tfv/Makefile
@@ -55,9 +55,10 @@ TFV:	tfv.o
 	ld65 -o TFV tfv.o -C ./apple2_1000.inc
 
 tfv.o:	tfv.s \
-	tfv_flying.s tfv_info.s tfv_multiply.s tfv_opener.s tfv_title.s \
+	tfv_flying.s tfv_info.s tfv_opener.s tfv_title.s \
 	tfv_textentry.s tfv_worldmap.s \
 	tfv_backgrounds.inc tfv_sprites.inc tfv_zp.inc \
+	../asm_routines/multiply_fast.s \
 	../asm_routines/hlin_clearscreen.s \
 	../asm_routines/pageflip.s \
 	../asm_routines/gr_setpage.s \
diff --git a/tfv/tfv_flying.s b/tfv/tfv_flying.s
index eabbf663..44c75aa0 100644
--- a/tfv/tfv_flying.s
+++ b/tfv/tfv_flying.s
@@ -1171,7 +1171,7 @@ water_map:
 	.byte $22,$22,$22,$22,  $22,$22,$22,$22
 	.byte $22,$22,$22,$22,  $ee,$22,$22,$22
 
-.include "tfv_multiply.s"
+.include "../asm_routines/multiply_fast.s"
 
 
 ; 8.8 fixed point
diff --git a/tfv/tfv_multiply.s b/tfv/tfv_multiply.s
deleted file mode 100644
index 294b3840..00000000
--- a/tfv/tfv_multiply.s
+++ /dev/null
@@ -1,349 +0,0 @@
-; Fast mutiply
-
-
-; Note for our purposes we only care about 8.8 x 8.8 fixed point
-; with 8.8 result, which means we only care about the middle two bytes
-; of the 32 bit result.  So we disable generation of the high and low byte
-; to save some cycles.
-
-;
-; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
-; This routine, at an expense of 2kB of looku tables, takes around 250
-;	If you reuse a term the next time this drops closer to 200
-
-; This routine was described by Stephen Judd and found
-;	in The Fridge and in the C=Hacking magazine
-; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
-
-; The key thing to note is that
-;	       (a+b)^2     (a-b)^2
-;       a*b =  -------  -  --------
-;                 4           4
-; So if you have tables of the squares of 0..511 you can lookup and subtract
-; instead of multiplying.
-
-; Table generation: I:0..511
-;                   square1_lo = <((I*I)/4)
-;                   square1_hi = >((I*I)/4)
-;                   square2_lo = <(((I-255)*(I-255))/4)
-;                   square2_hi = >(((I-255)*(I-255))/4)
-
-; Note: DOS3.3 starts at $9600
-
-square1_lo	EQU	$8E00
-square1_hi	EQU	$9000
-square2_lo	EQU	$9200
-square2_hi	EQU	$9400
-
-;	for(i=0;i<512;i++) {
-;		square1_lo[i]=((i*i)/4)&0xff;
-;		square1_hi[i]=(((i*i)/4)>>8)&0xff;
-;		square2_lo[i]=( ((i-255)*(i-255))/4)&0xff;
-;		square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
-;	}
-
-init_multiply_tables:
-
-	; Build the add tables
-
-	ldx	#$00
-	txa
-	.byte $c9   ; CMP #immediate - skip TYA and clear carry flag
-lb1:	tya
-	adc	#$00			; 0
-ml1:	sta	square1_hi,x		; square1_hi[0]=0
-	tay				; y=0
-	cmp	#$40			; subtract 64 and update flags (c=0)
-	txa				; a=0
-	ror				; rotate
-ml9:	adc	#$00			; add 0
-	sta	ml9+1			; update add value
-	inx				; x=1
-ml0:	sta	square1_lo,x		; square1_lo[0]=1
-	bne	lb1			; if not zero, loop
-	inc	ml0+2			; increment values
-	inc	ml1+2			; increment values
-	clc				; c=0
-	iny				; y=1
-	bne	lb1			; loop
-
-	; Build the subtract tables based on the existing one
-
-	ldx	#$00
-	ldy	#$ff
-second_table:
-	lda	square1_hi+1,x
-	sta	square2_hi+$100,x
-	lda	square1_hi,x
-	sta	square2_hi,y
-	lda	square1_lo+1,x
-	sta	square2_lo+$100,x
-	lda	square1_lo,x
-	sta	square2_lo,y
-	dey
-	inx
-	bne second_table
-
-
-	rts
-
-
-; Fast 16x16 bit unsigned multiplication, 32-bit result
-; Input: NUM1H:NUM1L * NUM2H:NUM2L
-; Result: RESULT3:RESULT2:RESULT1:RESULT0
-;
-; Does self-modifying code to hard-code NUM1H:NUM1L into the code
-;  carry=0: re-use previous NUM1H:NUM1L
-;  carry=1: reload NUM1H:NUM1L (58 cycles slower)
-;
-; clobbered: RESULT, X, A, C
-; Allocation setup: T1,T2 and RESULT preferably on Zero-page.
-;
-; NUM1H (x_i), NUM1L (x_f)
-; NUM2H (y_i), NUM2L (y_f)
-
-;	NUM1L * NUM2L = AAaa
-;	NUM1L * NUM2H = BBbb
-;	NUM1H * NUM2L = CCcc
-;	NUM1H * NUM2H = DDdd
-;
-;	       AAaa
-;	     BBbb
-;	     CCcc
-;	 + DDdd
-;	 ----------
-;	   RESULT
-
-;fixed_16x16_mul_unsigned:
-
-multiply:
-
-	bcc	num1_same_as_last_time					; 2nt/3
-
-	;============================
-	; Set up self-modifying code
-	; this changes the code to be hard-coded to multiply by NUM1H:NUM1L
-	;============================
-
-	lda	NUM1L	; load the low byte				; 3
-	sta	sm1a+1							; 3
-	sta	sm3a+1							; 3
-	sta	sm5a+1							; 3
-	sta	sm7a+1							; 3
-	eor	#$ff	; invert the bits for subtracting		; 2
-	sta	sm2a+1							; 3
-	sta	sm4a+1							; 3
-	sta	sm6a+1							; 3
-	sta	sm8a+1							; 3
-	lda	NUM1H	; load the high byte				; 3
-	sta	sm1b+1							; 3
-	sta	sm3b+1							; 3
-	sta	sm5b+1							; 3
-;	sta	sm7b+1							;
-	eor	#$ff	; invert the bits for subtractin		; 2
-	sta	sm2b+1							; 3
-	sta	sm4b+1							; 3
-	sta	sm6b+1							; 3
-;	sta	sm8b+1							;
-								;===========
-								;	 52
-
-num1_same_as_last_time:
-
-	;==========================
-	; Perform NUM1L * NUM2L = AAaa
-	;==========================
-
-	ldx	NUM2L	; (low le)					; 3
-	sec								; 2
-sm1a:
-	lda	square1_lo,x						; 4
-sm2a:
-	sbc	square2_lo,x						; 4
-
-	; a is _aa
-
-;	sta	RESULT+0						;
-
-sm3a:
-	lda	square1_hi,x						; 4
-sm4a:
-	sbc	square2_hi,x						; 4
-	; a is _AA
-	sta	_AA+1							; 3
-								;===========
-								;	24
-
-	; Perform NUM1H * NUM2L = CCcc
-	sec								; 2
-sm1b:
-	lda	square1_lo,x						; 4
-sm2b:
-	sbc	square2_lo,x						; 4
-	; a is _cc
-	sta	_cc+1							; 3
-sm3b:
-	lda	square1_hi,x						; 4
-sm4b:
-	sbc square2_hi,x						; 4
-	; a is _CC
-	sta	_CC+1							; 3
-								;===========
-								;	 24
-
-	;==========================
-	; Perform NUM1L * NUM2H = BBbb
-	;==========================
-	ldx	NUM2H							; 3
-	sec								; 2
-sm5a:
-	lda	square1_lo,x						; 4
-sm6a:
-	sbc	square2_lo,x						; 4
-	; a is _bb
-	sta	_bb+1							; 3
-
-sm7a:
-	lda	square1_hi,x						; 4
-sm8a:
-	sbc	square2_hi,x						; 4
-	; a is _BB
-	sta	_BB+1							; 3
-								;===========
-								;	 27
-
-	;==========================
-	; Perform NUM1H * NUM2H = DDdd
-	;==========================
-	sec								; 2
-sm5b:
-	lda	square1_lo,x						; 4
-sm6b:
-	sbc	square2_lo,x						; 4
-	; a is _dd
-	sta	_dd+1							; 3
-;sm7b:
-;	lda	square1_hi,x						;
-;sm8b:
-;	sbc	square2_hi,x						;
-	; a = _DD
-;	sta	RESULT+3						;
-								;===========
-								; 	 13
-
-	;===========================================
-	; Add the separate multiplications together
-	;===========================================
-
-	clc								; 2
-_AA:
-	lda	#0		; loading _AA				; 2
-_bb:
-	adc	#0		; adding in _bb				; 2
-	sta	RESULT+1						; 3
-								;==========
-								;	  9
-	; product[2]=_BB+_CC+c
-
-_BB:
-	lda	#0		; loading _BB				; 2
-_CC:
-	adc	#0		; adding in _CC				; 2
-	sta RESULT+2							; 3
-								;===========
-								;	  7
-
-	;  product[3]=_DD+c
-
-;	bcc	dd_no_carry1						;
-;	inc	RESULT+3						;
-	clc								; 2
-								;=============
-								;	  2
-dd_no_carry1:
-
-	; product[1]=_AA+_bb+_cc
-
-_cc:
-	lda	#0		; load _cc				; 2
-	adc	RESULT+1						; 3
-	sta	RESULT+1						; 3
-
-	; product[2]=_BB+_CC+_dd+c
-
-_dd:
-	lda	#0		; load _dd				; 2
-	adc	RESULT+2						; 3
-	sta	RESULT+2						; 3
-
-								;===========
-								;	 16
-	; product[3]=_DD+c
-
-
-;	bcc	dd_no_carry2						;
-;	inc	RESULT+3						;
-
-								;=============
-								;	 0
-
-dd_no_carry2:
-
-;	*z_i=product[1];
-;	*z_f=product[0];
-
-;	rts								; 6
-
-
-	;=================
-	; Signed multiply
-	;=================
-
-;multiply:
-
-;	jsr fixed_16x16_mul_unsigned					; 6
-
-	lda	NUM1H		; x_i					; 3
-								;===========
-								;	 12
-
-
-	bpl	x_positive						;^3/2nt
-
-	sec								; 2
-	lda	RESULT+2						; 3
-	sbc	NUM2L							; 3
-	sta	RESULT+2						; 3
-;	lda	RESULT+3						;
-;	sbc	NUM2H							;
-;	sta	RESULT+3						;
-								;============
-								;	 10
-
-x_positive:
-
-	lda	NUM2H		; y_i					; 3
-								;============
-								;	; 6
-
-	bpl	y_positive						;^3/2nt
-
-
-	sec								; 2
-	lda	RESULT+2						; 3
-	sbc	NUM1L							; 3
-	sta	RESULT+2						; 3
-;	lda	RESULT+3						;
-;	sbc	NUM1H							;
-;	sta	RESULT+3						;
-								;===========
-								;	 10
-
-y_positive:
-	ldx	RESULT+2	;	*z_i=product[2];		; 3
-	lda	RESULT+1	;	*z_f=product[1];		; 3
-
-	rts								; 6
-								;==========
-								;	 12
-