starpath: more optimization

2025-04-16 20:40:18 +00:00 · 2025-02-26 01:42:43 -05:00 · 2025-02-26 01:42:43 -05:00 · d4232185d4
commit d4232185d4
parent 61a50b4e9f
2 changed files with 417 additions and 91 deletions
--- a/graphics/gr/starpath/starpath.s
+++ b/graphics/gr/starpath/starpath.s
@ -13,6 +13,8 @@ SETGR	= $FB40

 FULLGR	= $C052

+COLOR	= $30
+
 FRAME	= $F0
 YPOS	= $F1
 XPOS	= $F2
@ -46,10 +48,51 @@ starpath:
 	; initialize
 	;=============================

-	jsr	init_multiply_tables

-	lda	#0
-	sta	FRAME
+init_multiply_tables:
+
+	; Build the add tables
+
+	ldx	#$00
+	txa
+	.byte $c9   ; CMP #immediate - skip TYA and clear carry flag
+lb1:	tya
+	adc	#$00			; 0
+ml1:	sta	square1_hi,x		; square1_hi[0]=0
+	tay				; y=0
+	cmp	#$40			; subtract 64 and update flags (c=0)
+	txa				; a=0
+	ror				; rotate
+ml9:	adc	#$00			; add 0
+	sta	ml9+1			; update add value
+	inx				; x=1
+ml0:	sta	square1_lo,x		; square1_lo[0]=1
+	bne	lb1			; if not zero, loop
+	inc	ml0+2			; increment values
+	inc	ml1+2			; increment values
+	clc				; c=0
+	iny				; y=1
+	bne	lb1			; loop
+
+	; Build the subtract tables based on the existing one
+
+	ldx	#$00
+	ldy	#$ff
+second_table:
+	lda	square1_hi+1,x
+	sta	square2_hi+$100,x
+	lda	square1_hi,x
+	sta	square2_hi,y
+	lda	square1_lo+1,x
+	sta	square2_lo+$100,x
+	lda	square1_lo,x
+	sta	square2_lo,y
+	dey
+	inx
+	bne second_table
+
+;	lda	#0
+	stx	FRAME

 next_frame:
 	lda	#0		; start with YPOS=0
@ -58,10 +101,10 @@ yloop:
 	lda	#0		; start with XPOS=0
 	sta	XPOS
 xloop:
-	ldx	#14
+	ldx	#14		; start Depth at 14

 depth_loop:
-	stx	DEPTH		; start Depth at 14
+	stx	DEPTH


 	;===============
@ -78,8 +121,6 @@ depth_loop:
 	jsr	multiply_u8x8	; 8-bit unsigned multiply

 	sta	YPH		; store out to YPH:YPL
-	;lda	PRODLO
-	;sta	YPL

 	;========================
 	; XP=(X*6)-DEPTH
@ -202,7 +243,9 @@ plot_pixel:
 	;=====================
 	; set color

-	jsr	SETCOL			; Set COLOR with ROM routine (mul*17)
+	sta	COLOR
+
+;	jsr	SETCOL			; Set COLOR with ROM routine (mul*17)

 	;=====================
 	; plot point
@ -237,7 +280,7 @@ yloop_done:
 	jmp	next_frame

 color_lookup:
-.byte	0,5,10,5,10,7,15,15,2,1,3,9,13,12
+.byte	$00,$55,$AA,$55,$AA,$77,$FF,$FF,$22,$11,$33,$99,$DD,$CC


 ; Fast mutiply
@ -271,24 +314,18 @@ color_lookup:


 ; Fast 8x8 bit unsigned multiplication, 16-bit result
-; Input: M1xM2
-; Result: M2:M1
+
+; input AxY
+; Result: M2,A:M1
 ;

-; input A/Y
-
 multiply_u8x8:
-	sta	M1
-	sty	M2
-
-	lda     M1
        sta     sm1a+1                                                  ; 3
        sta     sm3a+1                                                  ; 3
        eor     #$ff    ; invert the bits for subtracting               ; 2
        sta     sm2a+1                                                  ; 3
        sta     sm4a+1                                                  ; 3

-	ldy	M2
 	sec
 sm1a:
 	lda	square1_lo,Y
@ -299,42 +336,13 @@ sm3a:
 	lda	square1_hi,Y
 sm4a:
 	sbc	square2_hi,Y
-	sta	M2
+;	sta	M2

 	rts


 ; Fast mutiply -- setup tables

-
-; Note for our purposes we only care about 8.8 x 8.8 fixed point
-; with 8.8 result, which means we only care about the middle two bytes
-; of the 32 bit result.  So we disable generation of the high and low byte
-; to save some cycles.
-
-;
-; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
-; This routine, at an expense of 2kB of looku tables, takes around 250
-;	If you reuse a term the next time this drops closer to 200
-
-; This routine was described by Stephen Judd and found
-;	in The Fridge and in the C=Hacking magazine
-; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
-
-; The key thing to note is that
-;	       (a+b)^2     (a-b)^2
-;       a*b =  -------  -  --------
-;                 4           4
-; So if you have tables of the squares of 0..511 you can lookup and subtract
-; instead of multiplying.
-
-; Table generation: I:0..511
-;                   square1_lo = <((I*I)/4)
-;                   square1_hi = >((I*I)/4)
-;                   square2_lo = <(((I-255)*(I-255))/4)
-;                   square2_hi = >(((I-255)*(I-255))/4)
-
-
 .ifndef square1_lo
 square1_lo	=	$2000
 square1_hi	=	$2200
@ -349,48 +357,5 @@ square2_hi	=	$2600
 ;		square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
 ;	}

-init_multiply_tables:
-
-	; Build the add tables
-
-	ldx	#$00
-	txa
-	.byte $c9   ; CMP #immediate - skip TYA and clear carry flag
-lb1:	tya
-	adc	#$00			; 0
-ml1:	sta	square1_hi,x		; square1_hi[0]=0
-	tay				; y=0
-	cmp	#$40			; subtract 64 and update flags (c=0)
-	txa				; a=0
-	ror				; rotate
-ml9:	adc	#$00			; add 0
-	sta	ml9+1			; update add value
-	inx				; x=1
-ml0:	sta	square1_lo,x		; square1_lo[0]=1
-	bne	lb1			; if not zero, loop
-	inc	ml0+2			; increment values
-	inc	ml1+2			; increment values
-	clc				; c=0
-	iny				; y=1
-	bne	lb1			; loop
-
-	; Build the subtract tables based on the existing one
-
-	ldx	#$00
-	ldy	#$ff
-second_table:
-	lda	square1_hi+1,x
-	sta	square2_hi+$100,x
-	lda	square1_hi,x
-	sta	square2_hi,y
-	lda	square1_lo+1,x
-	sta	square2_lo+$100,x
-	lda	square1_lo,x
-	sta	square2_lo,y
-	dey
-	inx
-	bne second_table


-	rts
-
--- a/graphics/gr/starpath/starpath_256.s
+++ b/graphics/gr/starpath/starpath_256.s
@ -0,0 +1,361 @@
+; An Apple II lores version of Hellmood's amazing 64B DOS Star Path Demo
+;
+;       See https://hellmood.111mb.de//starpath_is_55_bytes.html
+;
+;     deater -- Vince Weaver -- vince@deater.net -- 25 February 2025
+
+
+PLOT	= $F800		; PLOT AT Y,A (A colors output, Y preserved)
+PLOT1	= $F80E		; PLOT at (GBASL),Y (need MASK to be $0f or $f0)
+
+SETCOL	= $F864		; COLOR=A
+SETGR	= $FB40
+
+FULLGR	= $C052
+
+COLOR	= $30
+
+FRAME	= $F0
+YPOS	= $F1
+XPOS	= $F2
+DEPTH	= $F3
+C	= $F4
+AL	= $F5
+M1	= $F6
+M2	= $F7
+TEMP	= $F8
+YPL	= $F9
+YPH	= $FA
+XPL	= $FB
+XPH	= $FC
+Q	= $FD
+
+	;=============================
+	;=============================
+	; star path
+	;=============================
+	;=============================
+
+starpath:
+	;=============================
+	; setup graphics
+	;=============================
+
+	jsr	SETGR			; set graphics
+	bit	$C052			; set full-screen graphics
+
+	;=============================
+	; initialize
+	;=============================
+
+
+init_multiply_tables:
+
+	; Build the add tables
+
+	ldx	#$00
+	txa
+	.byte $c9   ; CMP #immediate - skip TYA and clear carry flag
+lb1:	tya
+	adc	#$00			; 0
+ml1:	sta	square1_hi,x		; square1_hi[0]=0
+	tay				; y=0
+	cmp	#$40			; subtract 64 and update flags (c=0)
+	txa				; a=0
+	ror				; rotate
+ml9:	adc	#$00			; add 0
+	sta	ml9+1			; update add value
+	inx				; x=1
+ml0:	sta	square1_lo,x		; square1_lo[0]=1
+	bne	lb1			; if not zero, loop
+	inc	ml0+2			; increment values
+	inc	ml1+2			; increment values
+	clc				; c=0
+	iny				; y=1
+	bne	lb1			; loop
+
+	; Build the subtract tables based on the existing one
+
+	ldx	#$00
+	ldy	#$ff
+second_table:
+	lda	square1_hi+1,x
+	sta	square2_hi+$100,x
+	lda	square1_hi,x
+	sta	square2_hi,y
+	lda	square1_lo+1,x
+	sta	square2_lo+$100,x
+	lda	square1_lo,x
+	sta	square2_lo,y
+	dey
+	inx
+	bne second_table
+
+;	lda	#0
+	stx	FRAME
+
+next_frame:
+	lda	#0		; start with YPOS=0
+	sta	YPOS
+yloop:
+	lda	#0		; start with XPOS=0
+	sta	XPOS
+xloop:
+	ldx	#14		; start Depth at 14
+
+depth_loop:
+	stx	DEPTH
+
+
+	;===============
+	; YP = Y*4*DEPTH
+	;===============
+
+	lda	YPOS		;
+	asl
+	asl			; A is YPOS*4
+
+	tay			; multiply Y*4*DEPTH
+;	lda	DEPTH
+	txa
+	jsr	multiply_u8x8	; 8-bit unsigned multiply
+
+	sta	YPH		; store out to YPH:YPL
+
+	;========================
+	; XP=(X*6)-DEPTH
+	;	curve X by depth
+	;=========================
+
+	lda	XPOS		; load XPOS
+	asl
+	sta	XPL
+	asl
+	;clc			; carry always 0 as x never more than 40?
+	adc	XPL
+	sta	XPL		; XPL=XPOS*6
+
+	sta	AL		; AL also is XPOS*6
+
+	sec			; Subtract DEPTH
+	sbc	DEPTH
+	sta	XPL		; XP=(XPOS*6)-DEPTH
+
+				; if carry set means not negative
+				; and draw path
+				; otherwise we draw the sky
+	bcs	draw_path
+
+	;========================
+	; draw the sky
+	;========================
+draw_sky:
+
+	;================================
+	; set color to white for star?
+
+	lda	#31		; C=31
+	sta	C
+
+	;=====================
+	; calc A=(XPOS*6)+YP
+
+	; ??? used to see if star
+
+	clc			; A=X*6+YP
+	lda	AL
+	adc	M1		; YPL from previous multiply
+
+	;==============
+	; see if star
+
+	cmp	#6		; if A&0xFF < 6 then skip, we are star
+	bcc	plot_pixel
+
+	;==============
+	; not star, sky
+
+	lda	YPOS		; C=Y/4+32
+	lsr
+	lsr
+	clc
+	adc	#32
+	sta	C
+
+	bne	plot_pixel	; bra
+
+	;====================
+	; draw path
+	;====================
+
+draw_path:
+	;=================================
+	; calc XP*DEPTH and get high byte
+
+	ldy	XPL
+;	lda	DEPTH
+	txa
+	jsr	multiply_u8x8	; 8-bit unsigned multiply
+
+	;===================================
+	; calc Q= (XP*DEPTH)/256 | (YP/256)
+	;	for texture pattern
+
+	ora	YPH		; Q=(XP*DEPTH)/256 | YP/256
+	sta	Q
+
+	;==============================
+	; calc C = Q & (Depth + Frame)
+	; 	mask geometry by time shifted depth
+
+	clc
+;	lda	DEPTH
+	txa
+	adc	FRAME		; add depth plus frame  D+F
+
+	and	Q		; C = Q & (D+FRAME)
+	sta	C
+
+	;=========================
+	; increment depth
+
+;	inc	DEPTH		; DEPTH=DEPTH+1
+	inx
+
+	;==========================
+	; to create gaps
+
+	cmp	#16		; IF C<16 THEN 3
+	bcc	depth_loop
+
+	;===========================
+	; plot pixel
+	;	XPOS,YPOS  COLOR=LOOKUP(C/2-8)
+plot_pixel:
+	lda	C
+	lsr
+	sec
+	sbc	#8			; A is C/2-8
+
+	tay
+	lda	color_lookup,Y		; Lookup in color table
+
+	;=====================
+	; set color
+
+	sta	COLOR
+
+;	jsr	SETCOL			; Set COLOR with ROM routine (mul*17)
+
+	;=====================
+	; plot point
+
+	ldy	XPOS
+	lda	YPOS
+	jsr	PLOT			; PLOT AT Y,A (Y preserved)
+
+	;===================
+	; increment xloop
+
+	inc	XPOS
+	lda	XPOS
+	cmp	#40
+	bne	xloop
+;	beq	xloop_done
+;	jmp	xloop
+xloop_done:
+
+	;===================
+	; increment yloop
+
+	inc	YPOS
+	lda	YPOS
+	cmp	#48
+	bne	yloop
+;	beq	yloop_done
+;	jmp	yloop
+yloop_done:
+	inc	FRAME
+
+	jmp	next_frame
+
+color_lookup:
+.byte	$00,$55,$AA,$55,$AA,$77,$FF,$FF,$22,$11,$33,$99,$DD,$CC
+
+
+; Fast mutiply
+
+; Note for our purposes we only care about 8.8 x 8.8 fixed point
+; with 8.8 result, which means we only care about the middle two bytes
+; of the 32 bit result.  So we disable generation of the high and low byte
+; to save some cycles.
+
+;
+; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
+; This routine, at an expense of 2kB of looku tables, takes around 250
+;	If you reuse a term the next time this drops closer to 200
+
+; This routine was described by Stephen Judd and found
+;	in The Fridge and in the C=Hacking magazine
+; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
+
+; The key thing to note is that
+;	       (a+b)^2     (a-b)^2
+;       a*b =  -------  -  --------
+;                 4           4
+; So if you have tables of the squares of 0..511 you can lookup and subtract
+; instead of multiplying.
+
+; Table generation: I:0..511
+;                   square1_lo = <((I*I)/4)
+;                   square1_hi = >((I*I)/4)
+;                   square2_lo = <(((I-255)*(I-255))/4)
+;                   square2_hi = >(((I-255)*(I-255))/4)
+
+
+; Fast 8x8 bit unsigned multiplication, 16-bit result
+
+; input AxY
+; Result: M2,A:M1
+;
+
+multiply_u8x8:
+        sta     sm1a+1                                                  ; 3
+        sta     sm3a+1                                                  ; 3
+        eor     #$ff    ; invert the bits for subtracting               ; 2
+        sta     sm2a+1                                                  ; 3
+        sta     sm4a+1                                                  ; 3
+
+	sec
+sm1a:
+	lda	square1_lo,Y
+sm2a:
+	sbc	square2_lo,Y
+	sta	M1
+sm3a:
+	lda	square1_hi,Y
+sm4a:
+	sbc	square2_hi,Y
+;	sta	M2
+
+	rts
+
+
+; Fast mutiply -- setup tables
+
+.ifndef square1_lo
+square1_lo	=	$2000
+square1_hi	=	$2200
+square2_lo	=	$2400
+square2_hi	=	$2600
+.endif
+
+;	for(i=0;i<512;i++) {
+;		square1_lo[i]=((i*i)/4)&0xff;
+;		square1_hi[i]=(((i*i)/4)>>8)&0xff;
+;		square2_lo[i]=( ((i-255)*(i-255))/4)&0xff;
+;		square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
+;	}
+
+
+