starpath: more optimization

This commit is contained in:
Vince Weaver 2025-02-26 01:42:43 -05:00
parent 61a50b4e9f
commit d4232185d4
2 changed files with 417 additions and 91 deletions

View File

@ -13,6 +13,8 @@ SETGR = $FB40
FULLGR = $C052
COLOR = $30
FRAME = $F0
YPOS = $F1
XPOS = $F2
@ -46,10 +48,51 @@ starpath:
; initialize
;=============================
jsr init_multiply_tables
lda #0
sta FRAME
init_multiply_tables:
; Build the add tables
ldx #$00
txa
.byte $c9 ; CMP #immediate - skip TYA and clear carry flag
lb1: tya
adc #$00 ; 0
ml1: sta square1_hi,x ; square1_hi[0]=0
tay ; y=0
cmp #$40 ; subtract 64 and update flags (c=0)
txa ; a=0
ror ; rotate
ml9: adc #$00 ; add 0
sta ml9+1 ; update add value
inx ; x=1
ml0: sta square1_lo,x ; square1_lo[0]=1
bne lb1 ; if not zero, loop
inc ml0+2 ; increment values
inc ml1+2 ; increment values
clc ; c=0
iny ; y=1
bne lb1 ; loop
; Build the subtract tables based on the existing one
ldx #$00
ldy #$ff
second_table:
lda square1_hi+1,x
sta square2_hi+$100,x
lda square1_hi,x
sta square2_hi,y
lda square1_lo+1,x
sta square2_lo+$100,x
lda square1_lo,x
sta square2_lo,y
dey
inx
bne second_table
; lda #0
stx FRAME
next_frame:
lda #0 ; start with YPOS=0
@ -58,10 +101,10 @@ yloop:
lda #0 ; start with XPOS=0
sta XPOS
xloop:
ldx #14
ldx #14 ; start Depth at 14
depth_loop:
stx DEPTH ; start Depth at 14
stx DEPTH
;===============
@ -78,8 +121,6 @@ depth_loop:
jsr multiply_u8x8 ; 8-bit unsigned multiply
sta YPH ; store out to YPH:YPL
;lda PRODLO
;sta YPL
;========================
; XP=(X*6)-DEPTH
@ -202,7 +243,9 @@ plot_pixel:
;=====================
; set color
jsr SETCOL ; Set COLOR with ROM routine (mul*17)
sta COLOR
; jsr SETCOL ; Set COLOR with ROM routine (mul*17)
;=====================
; plot point
@ -237,7 +280,7 @@ yloop_done:
jmp next_frame
color_lookup:
.byte 0,5,10,5,10,7,15,15,2,1,3,9,13,12
.byte $00,$55,$AA,$55,$AA,$77,$FF,$FF,$22,$11,$33,$99,$DD,$CC
; Fast mutiply
@ -271,24 +314,18 @@ color_lookup:
; Fast 8x8 bit unsigned multiplication, 16-bit result
; Input: M1xM2
; Result: M2:M1
; input AxY
; Result: M2,A:M1
;
; input A/Y
multiply_u8x8:
sta M1
sty M2
lda M1
sta sm1a+1 ; 3
sta sm3a+1 ; 3
eor #$ff ; invert the bits for subtracting ; 2
sta sm2a+1 ; 3
sta sm4a+1 ; 3
ldy M2
sec
sm1a:
lda square1_lo,Y
@ -299,42 +336,13 @@ sm3a:
lda square1_hi,Y
sm4a:
sbc square2_hi,Y
sta M2
; sta M2
rts
; Fast mutiply -- setup tables
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; of the 32 bit result. So we disable generation of the high and low byte
; to save some cycles.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit mutiply
; This routine, at an expense of 2kB of looku tables, takes around 250
; If you reuse a term the next time this drops closer to 200
; This routine was described by Stephen Judd and found
; in The Fridge and in the C=Hacking magazine
; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
; The key thing to note is that
; (a+b)^2 (a-b)^2
; a*b = ------- - --------
; 4 4
; So if you have tables of the squares of 0..511 you can lookup and subtract
; instead of multiplying.
; Table generation: I:0..511
; square1_lo = <((I*I)/4)
; square1_hi = >((I*I)/4)
; square2_lo = <(((I-255)*(I-255))/4)
; square2_hi = >(((I-255)*(I-255))/4)
.ifndef square1_lo
square1_lo = $2000
square1_hi = $2200
@ -349,48 +357,5 @@ square2_hi = $2600
; square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
; }
init_multiply_tables:
; Build the add tables
ldx #$00
txa
.byte $c9 ; CMP #immediate - skip TYA and clear carry flag
lb1: tya
adc #$00 ; 0
ml1: sta square1_hi,x ; square1_hi[0]=0
tay ; y=0
cmp #$40 ; subtract 64 and update flags (c=0)
txa ; a=0
ror ; rotate
ml9: adc #$00 ; add 0
sta ml9+1 ; update add value
inx ; x=1
ml0: sta square1_lo,x ; square1_lo[0]=1
bne lb1 ; if not zero, loop
inc ml0+2 ; increment values
inc ml1+2 ; increment values
clc ; c=0
iny ; y=1
bne lb1 ; loop
; Build the subtract tables based on the existing one
ldx #$00
ldy #$ff
second_table:
lda square1_hi+1,x
sta square2_hi+$100,x
lda square1_hi,x
sta square2_hi,y
lda square1_lo+1,x
sta square2_lo+$100,x
lda square1_lo,x
sta square2_lo,y
dey
inx
bne second_table
rts

View File

@ -0,0 +1,361 @@
; An Apple II lores version of Hellmood's amazing 64B DOS Star Path Demo
;
; See https://hellmood.111mb.de//starpath_is_55_bytes.html
;
; deater -- Vince Weaver -- vince@deater.net -- 25 February 2025
; ROM entry points
PLOT = $F800 ; PLOT AT Y,A (A colors output, Y preserved)
PLOT1 = $F80E ; PLOT at (GBASL),Y (need MASK to be $0f or $f0)
SETCOL = $F864 ; COLOR=A
SETGR = $FB40 ; set graphics mode (called once at startup)
; Soft switch
FULLGR = $C052 ; full-screen graphics
; Zero page location written with the color byte before calling PLOT
COLOR = $30
; Zero page variables used by this program
FRAME = $F0 ; frame counter, incremented after every full screen
YPOS = $F1 ; current row, 0..47
XPOS = $F2 ; current column, 0..39
DEPTH = $F3 ; current depth, starts at 14 for every pixel
C = $F4 ; color index before the color_lookup table is applied
AL = $F5 ; XPOS*6
M1 = $F6 ; low byte of the last multiply_u8x8 result
M2 = $F7 ; not written by the visible code (store in multiply_u8x8 is commented out)
TEMP = $F8 ; not referenced by the visible code
YPL = $F9 ; not referenced by the visible code (low byte of YP is read from M1)
YPH = $FA ; high byte of YPOS*4*DEPTH
XPL = $FB ; (XPOS*6)-DEPTH
XPH = $FC ; not referenced by the visible code
Q = $FD ; texture value: high byte of XP*DEPTH ORed with YPH
;=============================
;=============================
; star path
;=============================
;=============================
starpath:
	;=============================
	; setup graphics
	;=============================
	; Program entry point.  Execution falls straight through into the
	; table initialisation that follows; there is no return.
	jsr	SETGR		; set graphics
	bit	FULLGR		; set full-screen graphics (named soft switch, was a bare $C052)
;=============================
; initialize
;=============================
; Builds the four 512-entry quarter-square tables read by multiply_u8x8,
; then clears FRAME.  This code is inline: it is entered by falling
; through from above and falls through into next_frame (no rts).
; It patches its own operands (ml0+2, ml1+2, ml9+1) and never restores
; them, so it is only correct the first time it runs after loading.
init_multiply_tables:
; Build the add tables: square1[i] = (i*i)/4 for i = 0..511
; X = low 8 bits of i, Y = high byte of the current entry,
; operand of ml9 = low byte of the entry computed on the previous pass
ldx #$00
txa
.byte $c9 ; CMP #immediate opcode: on the first pass it swallows the TYA below as its operand, keeping A=0 and clearing carry
lb1: tya ; A = high byte so far
adc #$00 ; add the carry out of the previous low-byte addition
ml1: sta square1_hi,x ; store high byte of entry X (page selected by the patched operand)
tay ; keep the high byte in Y
cmp #$40 ; carry = (high byte >= $40); (256*256)/4 = $4000, so this is bit 8 of i
txa
ror ; A = i/2 using that carry as bit 8, carry = bit 0 of i
ml9: adc #$00 ; previous low byte + i/2 + (i&1) = low byte of the next entry
sta ml9+1 ; remember it for the next pass
inx
ml0: sta square1_lo,x ; stored after the inx, so this is the low byte of entry X+1
bne lb1 ; loop until X wraps (256 entries)
inc ml0+2 ; move both store targets to the next page
inc ml1+2
clc ; the carry into the high byte is supplied by the iny below instead
iny ; bump the high byte; it wraps to 0 at the end of the second page
bne lb1 ; run the second page, then fall out
; Build the subtract tables based on the existing one:
; square2[i] = ((i-255)*(i-255))/4, i.e. square1 mirrored around index 255
ldx #$00 ; X counts up from 0
ldy #$ff ; Y counts down from 255
second_table:
lda square1_hi+1,x ; upper half: square2[256+X] = square1[X+1]
sta square2_hi+$100,x
lda square1_hi,x ; lower half: square2[255-X] = square1[X]
sta square2_hi,y
lda square1_lo+1,x ; same two copies for the low bytes
sta square2_lo+$100,x
lda square1_lo,x
sta square2_lo,y
dey
inx
bne second_table ; 256 passes
; lda #0
stx FRAME ; X is 0 when the loop exits: start at frame 0
;=============================
; main loop
;=============================
; For every frame, visit each of the 40x48 pixels.  For each pixel,
; step DEPTH upward from 14 until either XP goes negative (sky or star)
; or the texture test yields a color index of 16 or more (path).
; Never returns: yloop_done jumps back to next_frame.
;
; The X register holds the current depth throughout depth_loop;
; multiply_u8x8 does not touch X.
next_frame:
	lda	#0		; start with YPOS=0
	sta	YPOS

yloop:
	lda	#0		; start with XPOS=0
	sta	XPOS

xloop:
	;========================
	; AL = XPOS*6
	; depends only on XPOS, so it is computed once per pixel
	; rather than once per depth step
	;========================
	lda	XPOS		; load XPOS
	asl			; A = XPOS*2
	sta	AL
	asl			; A = XPOS*4
	;clc			; carry is clear: XPOS < 40 so neither asl shifts out a 1
	adc	AL
	sta	AL		; AL = XPOS*6

	ldx	#14		; start Depth at 14

depth_loop:
	stx	DEPTH

	;===============
	; YP = Y*4*DEPTH
	;===============
	lda	YPOS
	asl
	asl			; A is YPOS*4
	tay			; multiply Y*4*DEPTH
	txa			; A = DEPTH
	jsr	multiply_u8x8	; 8-bit unsigned multiply: A = high byte, M1 = low byte
	sta	YPH		; keep the high byte; the low byte stays in M1

	;========================
	; XP=(X*6)-DEPTH
	; curve X by depth
	;=========================
	lda	AL		; XPOS*6
	sec			; Subtract DEPTH
	sbc	DEPTH
	sta	XPL		; XP=(XPOS*6)-DEPTH

	; if carry set means not negative
	;	and draw path
	; otherwise we draw the sky
	bcs	draw_path

	;========================
	; draw the sky
	;========================
draw_sky:
	;================================
	; set color to white for star?
	lda	#31		; C=31
	sta	C

	;=====================
	; calc A=(XPOS*6)+YP
	;	??? used to see if star
	clc			; A=X*6+YP
	lda	AL
	adc	M1		; low byte of YP from the multiply above

	;==============
	; see if star
	cmp	#6		; if A&0xFF < 6 then skip, we are star
	bcc	plot_pixel

	;==============
	; not star, sky
	lda	YPOS		; C=Y/4+32
	lsr
	lsr
	clc
	adc	#32
	sta	C
	bne	plot_pixel	; bra (A is at least 32 here, never zero)

	;====================
	; draw path
	;====================
draw_path:
	;=================================
	; calc XP*DEPTH and get high byte
	ldy	XPL
	txa			; A = DEPTH
	jsr	multiply_u8x8	; 8-bit unsigned multiply (overwrites M1)

	;===================================
	; calc Q= (XP*DEPTH)/256 | (YP/256)
	;	for texture pattern
	ora	YPH		; Q=(XP*DEPTH)/256 | YP/256
	sta	Q

	;==============================
	; calc C = Q & (Depth + Frame)
	;	mask geometry by time shifted depth
	clc
	txa			; A = DEPTH
	adc	FRAME		; add depth plus frame D+F
	and	Q		; C = Q & (D+FRAME)
	sta	C

	;=========================
	; increment depth
	inx			; DEPTH=DEPTH+1 (does not change A)

	;==========================
	; to create gaps
	cmp	#16		; IF C<16 THEN 3
	bcc	depth_loop

	;===========================
	; plot pixel
	;	XPOS,YPOS COLOR=LOOKUP(C/2-8)
	; NOTE(review): color_lookup has 14 entries.  The sky/star values
	; of C (31, 32..43) stay inside it, but on the path C is only known
	; to be >= 16, so an index above 13 appears possible; confirm that
	; reading past the table is acceptable.
plot_pixel:
	lda	C
	lsr
	sec
	sbc	#8		; A is C/2-8
	tay
	lda	color_lookup,Y	; Lookup in color table

	;=====================
	; set color
	sta	COLOR		; table bytes are stored directly, no SETCOL call needed

	;=====================
	; plot point
	ldy	XPOS
	lda	YPOS
	jsr	PLOT		; PLOT AT Y,A (Y preserved)

	;===================
	; increment xloop
	inc	XPOS
	lda	XPOS
	cmp	#40
	bne	xloop

xloop_done:
	;===================
	; increment yloop
	inc	YPOS
	lda	YPOS
	cmp	#48
	bne	yloop

yloop_done:
	inc	FRAME
	jmp	next_frame
; Color bytes indexed by (C/2)-8 in plot_pixel; 14 entries.
; Each byte carries the same value in both nibbles, so it is stored
; straight into COLOR instead of going through the SETCOL ROM routine.
color_lookup:
.byte $00,$55,$AA,$55,$AA,$77,$FF,$FF,$22,$11,$33,$99,$DD,$CC
; Fast multiply
; Note for our purposes we only care about 8.8 x 8.8 fixed point
; with 8.8 result, which means we only care about the middle two bytes
; of the 32 bit result. So we disable generation of the high and low byte
; to save some cycles.
;
; The old routine took around 700 cycles for a 16bitx16bit=32bit multiply
; This routine, at an expense of 2kB of lookup tables, takes around 250
; If you reuse a term the next time this drops closer to 200
; This routine was described by Stephen Judd and found
; in The Fridge and in the C=Hacking magazine
; http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
; The key thing to note is that
;        (a+b)^2     (a-b)^2
; a*b =  -------  -  -------
;           4           4
; So if you have tables of the squares of 0..511 you can lookup and subtract
; instead of multiplying.
; Table generation: I:0..511
; square1_lo = <((I*I)/4)
; square1_hi = >((I*I)/4)
; square2_lo = <(((I-255)*(I-255))/4)
; square2_hi = >(((I-255)*(I-255))/4)
; Fast 8x8 bit unsigned multiplication, 16-bit result
; input AxY
; Result: A:M1 (A = high byte, M1 = low byte)
;	M2 is no longer written; the store is commented out below
;	X is not used by this routine
;
; Self-modifying: A (and A eor $ff) is written over the low operand byte
; of the four table accesses below.  This assumes every table starts on
; a page boundary, so that the patched byte alone selects the offset.
multiply_u8x8:
sta sm1a+1 ; 3
sta sm3a+1 ; 3
eor #$ff ; invert the bits for subtracting ; 2
sta sm2a+1 ; 3
sta sm4a+1 ; 3
sec ; no borrow going into the low-byte subtract
sm1a:
lda square1_lo,Y ; low byte of square1[A+Y]
sm2a:
sbc square2_lo,Y ; minus low byte of square2[(A eor $ff)+Y]
sta M1 ; low byte of the product
sm3a:
lda square1_hi,Y ; same subtraction on the high bytes
sm4a:
sbc square2_hi,Y ; borrow carries over from the low-byte subtract
; sta M2
rts ; high byte of the product is returned in A
; Fast multiply -- setup tables
; Default table locations, used only when square1_lo is not already
; defined.  The four tables are $200 (512) bytes apart, one byte per
; entry for the 512 entries described below.
.ifndef square1_lo
square1_lo = $2000
square1_hi = $2200
square2_lo = $2400
square2_hi = $2600
.endif
; Table contents, as generated by init_multiply_tables:
; for(i=0;i<512;i++) {
; square1_lo[i]=((i*i)/4)&0xff;
; square1_hi[i]=(((i*i)/4)>>8)&0xff;
; square2_lo[i]=( ((i-255)*(i-255))/4)&0xff;
; square2_hi[i]=(( ((i-255)*(i-255))/4)>>8)&0xff;
; }