Initial checkin

fdraw v0.3
This commit is contained in:
Andy McFadden 2015-08-21 14:15:18 -07:00
parent 2ca9d4084f
commit 418e7b7191
10 changed files with 4446 additions and 2 deletions

549
AMPERFDRAW.S Normal file
View File

@ -0,0 +1,549 @@
********************************
* *
* Amper-fdraw *
* By Andy McFadden *
* For fdraw version 0.3 *
* *
* Applesoft ampersand *
* interface for fdraw. *
* *
* Developed with Merlin-16 *
* *
********************************
lst off
org $1d60
* All of the handler entry points can fit on a single
* page, so it's possible to save a few bytes by
* dropping the high jump table and just hardcoding
* the first page into the jump. This requires that
* the ORG be at $xx00.
PUT FDRAW.DEFS
* Applesoft BASIC tokens.
tok_plot equ $8d
tok_hgr2 equ $90
tok_hgr equ $91
tok_hcolor equ $92
tok_hplot equ $93
tok_draw equ $94
tok_xdraw equ $95
tok_inverse equ $9e
tok_clear equ $bd
tok_new equ $bf
tok_to equ $c1
tok_at equ $c5
*tok_sgn equ $d2
tok_scrn equ $d7
tok_exp equ $dd
tok_cos equ $de
tok_sin equ $df
* System locations.
PCL equ $3a ;used by monitor
PCH equ $3b ;used by monitor
A1L equ $3c ;used by monitor
A1H equ $3d ;used by monitor
LINNUM equ $50 ;50-51
FACLO equ $a1
CHRGET equ $b1 ;advance ptr, get next tok
CHRGOT equ $b7 ;get next tok (no advance)
TXTPTR equ $b8
HPAG equ $e6 ;$20 or $40
AMPERV equ $3f5
TXTCLR equ $c050
TXTSET equ $c051
MIXCLR equ $c052
MIXSET equ $c053
LOWSCR equ $c054
HISCR equ $c055
LORES equ $c056
HIRES equ $c057
ERROR equ $d412 ;error based on X reg
FRMNUM equ $dd67
SynError equ $dec9 ;throw SYNTAX ERROR
CHKCOM equ $debe
IllQError equ $e199 ;throw ILLEGAL QUANTITY ERROR
GETADR equ $e752
GETBYT equ $e6f8 ;gets byte, in X/FACLO
HFNS equ $f6b9 ;get hi-res x/y for hplot
* Prepare the ampersand vector.
*
* Ideally we'd check to see if the existing vector is
* different from ours, and if so, jump to it when we
* get a token we don't recognize. Not convinced
* there's an actual use case for this.
* init - install the ampersand hook. Writes a JMP opcode
* followed by the address of "dispatch" into the three
* bytes starting at AMPERV. Only A is modified.
init
lda #$4c ;JMP, in case it got
sta AMPERV ; trashed
lda #<dispatch ;low byte of handler address
sta AMPERV+1
lda #>dispatch ;high byte of handler address
sta AMPERV+2
rts
* Entry point from BASIC. The token is in A.
* dispatch - look up the token in A and transfer control
* to its handler.
*
* :cmdtab is searched from the last entry down to the
* first. On a match, the handler address minus one
* (that is what :jmptabh/:jmptabl hold) is pushed on the
* stack, high byte first, and we jump to CHRGET; when
* CHRGET returns, control arrives at the handler with
* the token consumed. An unknown token goes to SynError.
dispatch
ldx #:cmdend-:cmdtab-1 ;index of last table entry
]loop cmp :cmdtab,x
beq :match
dex
bpl ]loop ;keep going until index 0 tested
jmp SynError ;token not in table
:match
lda :jmptabh,x ;push (handler-1), high byte first
* lda #>h_new ;all on first page
pha
lda :jmptabl,x
pha
jmp CHRGET ;eat token, jump
:cmdtab dfb tok_new
dfb tok_hgr
dfb tok_hgr2
dfb tok_scrn
dfb tok_hcolor
dfb tok_inverse
dfb tok_clear
dfb tok_hplot
dfb tok_xdraw
dfb tok_draw
dfb tok_exp
dfb tok_cos
dfb tok_sin
dfb tok_at
dfb tok_plot
:cmdend
:jmptabl dfb <h_new-1
dfb <h_hgr-1
dfb <h_hgr2-1
dfb <h_scrn-1
dfb <h_hcolor-1
dfb <h_inverse-1
dfb <h_clear-1
dfb <h_hplot-1
dfb <h_xdraw-1
dfb <h_draw-1
dfb <h_exp-1
dfb <h_cos-1
dfb <h_sin-1
dfb <h_at-1
dfb <h_plot-1
:jmptabh dfb >h_new-1
dfb >h_hgr-1
dfb >h_hgr2-1
dfb >h_scrn-1
dfb >h_hcolor-1
dfb >h_inverse-1
dfb >h_clear-1
dfb >h_hplot-1
dfb >h_xdraw-1
dfb >h_draw-1
dfb >h_exp-1
dfb >h_cos-1
dfb >h_sin-1
dfb >h_at-1
dfb >h_plot-1
********************************
* &NEW - initialize
* Resets the ampersand layer's own state, then tail-calls
* f_Init: drawing page = $20, color = 0, previous
* HPLOT point = (0,0), array-draw center = (139,95).
h_new
lda #$20 ;match Init result
sta g_cur_page
lda #$00
sta g_hcolor
tax ;init "previous hplot"
tay ; coord to zero
jsr storeprv ;X/Y/A = xlo/xhi/y, all zero
ldx #139 ;279/2
ldy #0
lda #95 ;191/2
jsr storeac ;center of screen
jmp f_Init
********************************
* &HGR - show page 1 with mixed text, and clear screen.
* The screen is cleared to color zero; the current
* color (g_hcolor) is restored afterward.
h_hgr
ldx #$20 ;page 1
lda #$00 ;$c054
beq hgr_com ;always (A is zero)
********************************
* &HGR2 - show page 2 with no text, and clear screen.
* The screen is cleared to color zero; the current
* color (g_hcolor) is restored afterward.
h_hgr2
ldx #$40 ;page 2
lda #$01 ;$c055
;fall through to hgr_com
* Common code. On entry X holds the page value ($20 or
* $40) and A is 0 for page 1 or 1 for page 2.
*
* We go slightly out of our way to clear the screen
* before tripping the softswitches. This avoids
* flashing the previous hi-res page contents when
* entering from text mode.
*
* We also want to go nomix-page2 but page1-mix
* (note reverse order) to avoid flashing text pg 2.
hgr_com stx f_in_arg
stx g_cur_page
stx HPAG ;probably useful
pha ;save page selector for later
jsr f_SetPage
lda #$00 ;clear with color zero
sta f_in_arg
jsr f_SetColor
jsr f_Clear
lda g_hcolor ;restore color
sta f_in_arg
jsr f_SetColor
bit TXTCLR ;$c050
bit HIRES ;$c057
pla ;0 = page 1, 1 = page 2
beq :pg1
bit MIXCLR ;$c052
bit HISCR ;$c055
rts
:pg1 bit LOWSCR ;$c054
bit MIXSET ;$c053
rts
********************************
* &SCRN({1,2}) - set the current hi-res page
* Accepts only 1 or 2; anything else raises ILLEGAL
* QUANTITY. The value is multiplied by 32 to form the
* page value ($20 or $40), which is recorded in
* g_cur_page and handed to f_SetPage. The character
* after the expression is skipped without being checked.
h_scrn
jsr GETBYT ;page number in X
cpx #1
beq :okay
cpx #2
beq :okay
jmp IllQError
:okay jsr CHRGET ;eat ')' (we assume)
txa ;X/Y unaltered
asl
asl
asl
asl
asl ;multiply x32
sta g_cur_page
sta f_in_arg
jmp f_SetPage
********************************
* &HCOLOR={0-7} - set the current color
* Values of 8 or more raise ILLEGAL QUANTITY. The color
* is remembered in g_hcolor (hgr_com restores it after
* clearing) and passed to f_SetColor.
h_hcolor
jsr GETBYT ;get color
cpx #8
blt :okay ;0-7 accepted
jmp IllQError
:okay stx f_in_arg
stx g_hcolor
jmp f_SetColor
********************************
* &INVERSE - flip pages
*
* If we're currently drawing on $20, we set the page
* to $40 and hit $c054 to show $20. And vice-versa.
* The goal is to make double-buffered animation easy.
h_inverse
lda g_cur_page
eor #$60 ;toggle $20 <-> $40
sta g_cur_page
ldx #$00 ;X selects LOWSCR (0) or HISCR (1)
cmp #$40 ;about to start drawing on 2?
beq :showpg1 ;yes, show page 1
inx ;no, show page 2
:showpg1 ldy LOWSCR,x ;touch $c054 or $c055 (A preserved)
sta f_in_arg ;A still holds the new draw page
jmp f_SetPage
********************************
* &CLEAR - clear current page to current color.
h_clear
jmp f_Clear ;well, that was easy
********************************
* &XDRAW left,top,right,bottom - draw rectangle outline
h_xdraw
jsr getltrb
jmp f_DrawRect
********************************
* &DRAW left,top,right,bottom - draw filled rectangle
h_draw
jsr getltrb
jmp f_FillRect
********************************
* &EXP {0,1} - set line draw mode
h_exp
jsr GETBYT
cpx #2
blt :okay
jmp IllQError
:okay stx f_in_arg
jmp f_SetLineMode
********************************
* &COS cx,cy,rad - draw outline circle
h_cos
jsr getcxcyr
jmp f_DrawCircle
********************************
* &SIN cx,cy,rad - draw filled circle
h_sin
jsr getcxcyr
jmp f_FillCircle
********************************
* &AT x,y - select center for array draw
h_at
jsr HFNS
jmp storeac
********************************
* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy]
* draw lines from arrays of vertices and indices
h_plot jmp array_draw
********************************
* &HPLOT x,y - draw a point
* &HPLOT TO x,y - draw a line from last point to x,y
* &HPLOT x0,y0 to x1,y1 - draw a line
* The most recently used coordinate is kept in g_prev*
* so that a later "&HPLOT TO" can continue from it.
lst on ;last token handler --
h_hplot equ * ; must be on first page
lst off ; to omit high byte table
jsr CHRGOT ;check next token
lst off
cmp #tok_to ;is this an "HPLOT TO"?
beq :leadingto
jsr getx1y1 ;get the first coord
jsr copy1to0 ; and make it x0/y0
jsr CHRGOT ;see if single point
cmp #tok_to
beq :hplot_to ;nope, draw line
jsr copy0toprev ;draw point, and save x/y
jmp f_DrawPoint ; for subsequent HPLOT TO
:leadingto ;"HPLOT TO", restore the
lda g_prevxl ; previous coord to x0/y0
sta f_in_x0l ;(can't rely on f_in_zzz
lda g_prevxh ; being there -- we might
sta f_in_x0h ; have drawn a rect)
lda g_prevy
sta f_in_y0
* Loop: one line segment per "TO x,y" clause.
:hplot_to
jsr CHRGET ;eat the TO
jsr getx1y1 ;get the coords
jsr f_DrawLine ;draw it
jsr copy1to0 ;shift 1->0 for next round
jsr CHRGOT
cmp #tok_to ;another TO?
beq :hplot_to ;yes, branch
jmp copy0toprev ;no, save prev and bail
* Get coordinates and store in X1/Y1.
getx1y1
jsr HFNS
store1 stx f_in_x1l ;store X/Y/A in coord1
sty f_in_x1h
sta f_in_y1
rts
* Save x0/y0 as our "previous" coordinate.
copy0toprev
ldx f_in_x0l
ldy f_in_x0h
lda f_in_y0
storeprv stx g_prevxl ;store X/Y/A in g_prev
sty g_prevxh
sta g_prevy
rts
* Copy X1/Y1 into X0/Y0.
copy1to0
ldx f_in_x1l
ldy f_in_x1h
lda f_in_y1
store0 stx f_in_x0l ;store X/Y/A in coord 0
sty f_in_x0h
sta f_in_y0
rts
* Store X/Y/A into array-center.
storeac stx g_ac_xl
sty g_ac_xh
sta g_ac_y
rts
* Get left/top/right/bottom coordinates.
* Parses "left,top,right,bottom" from the program text.
* left/top are stored in f_in_x0l/x0h/y0 and right/bottom
* in f_in_x1l/x1h/y1. Returns through store1's RTS.
getltrb
         jsr   HFNS       ;parse left,top
         jsr   store0     ;save as X0/Y0
         jsr   CHKCOM     ;eat a comma
         jsr   HFNS       ;parse right,bottom
         jmp   store1     ;save as X1/Y1 (tail call)
* Get center coordinates and radius.
getcxcyr
jsr HFNS ;get CX and CY
jsr store0 ;save as X0/Y0
jsr CHKCOM ;eat a comma
jsr GETBYT ;convert to 0-255
stx f_in_rad
rts
* Array-draw handler.
*
* Parses "vertexAddr, indexAddr, indexCount [AT cx,cy]"
* and draws one line for each consecutive pair of
* entries in the index list. indexCount is the number
* of lines (0-127); it is doubled to get the number of
* index bytes to consume. A pair is skipped when either
* of its vertices is rejected by getvertex.
*
* The optional AT clause is parsed before the count is
* tested, so it is consumed (and the array center is
* updated) even when indexCount is zero.
*
* Scratch: A1L/A1H hold the vertex pointer, LINNUM the
* index pointer, PCL the doubled count and PCH the
* cursor. fdraw's zero-page assignments (zloc*/zptr0)
* do not include any of these locations.
array_draw
]vertices equ  A1L        ;2b
]indices equ   LINNUM     ;2b
]count   equ   PCL
]cur     equ   PCH
         jsr   FRMNUM     ;get vertex buffer address
         jsr   GETADR
         lda   LINNUM     ;copy to A1L
         sta   ]vertices
         lda   LINNUM+1
         sta   ]vertices+1
         jsr   CHKCOM     ;eat the comma
         jsr   FRMNUM     ;get index buffer address
         jsr   GETADR     ;leave it in LINNUM
         jsr   CHKCOM
         jsr   GETBYT     ;get the count
         cpx   #128       ;range check (0-127)
         blt   :countok
         jmp   IllQError
:countok txa
         asl              ;double it (2 indices/line)
         sta   ]count     ;stash it
         lda   #$00
         sta   ]cur
* Check for optional AT cx,cy.
         jsr   CHRGOT
         cmp   #tok_at
         bne   :noat
         jsr   CHRGET     ;eat the AT
         lda   LINNUM     ;the code that reads the
         pha              ; hi-res coordinates will
         lda   LINNUM+1   ; overwrite LINNUM, so
         pha              ; we have to save & restore
         jsr   h_at
         pla
         sta   LINNUM+1
         pla
         sta   LINNUM
:noat    lda   ]count     ;zero lines requested?
         beq   :done      ;yes, nothing to draw
]loop    jsr   getvertex  ;first endpoint
         bcs   :skip2     ;bad: also skip its partner
         jsr   store0
         jsr   getvertex  ;second endpoint
         bcs   :skip
         jsr   store1
         jsr   f_DrawLine
         dfb   $2c        ;BIT addr (swallows the INC)
:skip2   inc   ]cur
:skip    lda   ]cur
         cmp   ]count
         blt   ]loop
:done    rts
* Get the Nth vertex, specified by ]cur, and load it
* into X/Y/A (xlo/xhi/y). Returns with carry set if
* the vertex is invalid.
*
* Increments ]cur by 1.
* An index byte with the high bit set is rejected. On
* success the carry is clear and X/Y/A hold the point
* (x low, x high, y), ready for store0/store1.
getvertex
ldy ]cur
inc ]cur
lda (]indices),y ;fetch vertex index
bmi :badv ;must be 0-127
jsr :calcvertex ;fills g_out_x / g_out_y
ldx g_out_x
ldy g_out_x+1
beq :xok ;0-255, ok
cpy #1
bne :badv ;512+ (or negative)
cpx #280-256
bge :badv ;280-511
:xok
lda g_out_y+1
bne :badv ;Y is neg or > 255
lda g_out_y
cmp #192
bcc :goodv ;0-191: return with carry clear
:badv
sec
:goodv rts
* Get VX and VY, merging with AC, and store in
* 16-bit g_out_x and g_out_y. Range not checked
* here. On entry, A has vertex index.
*
* Each vertex occupies two bytes (x then y), each of
* which is sign-extended to 16 bits before the
* array-center coordinate is added.
:calcvertex
asl ;index * 2 = byte offset
tay
ldx #$00 ;hi byte of vertex
lda (]vertices),y ;x-coord
bpl :xpos
dex ;sign-extend hi byte
:xpos clc
adc g_ac_xl
sta g_out_x
txa
adc g_ac_xh
sta g_out_x+1
iny
ldx #$00
lda (]vertices),y ;y-coord
bpl :ypos
dex ;sign-extend hi byte
:ypos clc
adc g_ac_y ;center Y is a single byte
sta g_out_y
bcc :nocarry
inx ;propagate carry into hi byte
:nocarry stx g_out_y+1
rts
********************************
* Global variables
g_cur_page ds 1 ;$20 or $40
g_hcolor ds 1
g_prevxl ds 1
g_prevxh ds 1
g_prevy ds 1
g_ac_xl ds 1 ;Center-point coordinates
g_ac_xh ds 1 ; for array-based line
g_ac_y ds 1 ; draw (&AT, &PLOT).
g_out_x ds 2 ;16-bit coordinates for
g_out_y ds 2 ; array-based line draw
lst on
end equ *
sav amperfdraw
lst off

752
FDRAW.CIRCLE.S Normal file
View File

@ -0,0 +1,752 @@
********************************
* *
* Fast Apple II Graphics *
* By Andy McFadden *
* Version 0.3, Aug 2015 *
* *
* Circle rendering *
* (Included by FDRAW.S) *
* *
* Developed with Merlin-16 *
* *
********************************
* TODO: if USE_FAST is 0, replace the outline circle
* plot code with calls to DrawPoint (or maybe a
* common sub-function so we don't trash the input
* parameters). Saves a little space.
********************************
*
* Draw a circle. The radius is in in_rad, and
* the center is at in_x0l+in_x0h,in_y0.
*
********************************
* Inputs: in_rad (radius), in_x0l/in_x0h/in_y0 (center).
*
* A radius of zero is drawn as a single point via
* DrawPoint; calc_circle's radius-zero path only fills
* in the raster tables and plots nothing, which would
* leave an outline circle of radius 0 invisible.
*
* Otherwise, make sure the plot calls inside
* calc_circle are JSRs (outline mode) and run it.
DrawCircle
         lda   in_rad     ;radius zero?
         bne   :nonzero
         jmp   DrawPoint  ;yes, just plot the center
:nonzero lda   #$20       ;JSR
         cmp   _cp08      ;configured for outline?
         beq   :okay
         jsr   fixcplot   ;no, patch in the JSRs
:okay
         jmp   calc_circle
********************************
*
* Draw filled circle.
*
********************************
* Makes sure the plot calls inside calc_circle have been
* replaced with BIT (a no-op), computes the edges into
* the raster tables, then lets FillRaster draw them.
FillCircle
lda #$2c ;BIT
cmp _cp08 ;configured for fill?
beq :okay
jsr fixcplot ;no, patch out the plot calls
:okay
jsr calc_circle
jmp FillRaster
* Calculate a circle, using Bresenham's algorithm. The
* results are placed into the rasterization buffers.
*
* in_rad must be from 0 to 255. The x/y center
* coordinates must be on the screen, but the circle
* can extend off the edge.
*
* The computed values are stored in the rasterization
* tables. For an outline circle, we also plot the
* points immediately.
do USE_FAST ;*****
* local storage -- not used often enough to merit DP
circ_8bit ds 1
circ_clip ds 1
fin ;*****
calc_circle
max_fast_rad equ 41
]cxl equ zloc0
]cxh equ zloc1
]cy equ zloc2
]dlo equ zloc3
]dhi equ zloc4
]xsav equ zloc5
]ysav equ zloc6
]min_x equ zloc7 ;min/max offsets from center
]max_x equ zloc8 ;(min is above center, max
]min_y equ zloc9 ; is below)
]max_y equ zloc10
]hitmp equ zloc11
* only used by hplot for outline circles
]hbasl equ zptr0
]andmask equ zloc11 ;overlaps with ]hitmp
]savxreg equ zloc12
]savyreg equ zloc13
* Special-case radius=0. It removes an annoying
* edge case (first y-- becomes 0xff, but 6502 cmp
* is unsigned).
lda in_rad
bne :notzero
ldy in_y0
sty rast_top
sty rast_bottom
lda in_x0l
sta rastx0l,y
sta rastx1l,y
lda in_x0h
sta rastx0h,y
sta rastx1h,y
rts
* Use different version of function for small
* circles, because we can do it all in 8 bits.
:notzero
do USE_FAST ;*****
ldy #$01
cmp #max_fast_rad ;in_rad in Acc
blt :use_fast
dey
:use_fast sty circ_8bit
fin ;*****
lda in_x0l ;copy center to DP for speed
sta ]cxl
lda in_x0h
sta ]cxh
lda in_y0
sta ]cy
* Compute min/max values, based on offset from center.
* These are compared against offset-from-center x/y.
* We need tight bounds on Y because we use it to
* compute the rast_render top/bottom. Getting tight
* bounds on X is not so important, but we still need
* it for the no-clip optimization.
ldx #$04 ;count edges needing clip
lda #NUM_ROWS-1 ;191
sec
sbc ]cy ;maxY = 191-cy
cmp in_rad
blt :ylimok
lda in_rad ;clamp to radius
dex
:ylimok sta ]max_y ;maxY = 191-cy
lda ]cy ;minY = cy
cmp in_rad
blt :ylimok2
lda in_rad ;clamp to radius
dex
:ylimok2 sta ]min_y
* Compute the X limits. X (register) is decremented
* once for every edge that cannot be crossed, i.e.
* where the limit was clamped to the radius.
lda ]cxh
beq :xlimlo
* Center X is 256 or more.
* Examples (note #<NUM_COLS-1 is 279-256 = 23):
* cx=265 (cxh=1 cxl=9), 23-9=14, chk rad
lda #<NUM_COLS-1 ;maxX = 279-cx
sec
sbc ]cxl
cmp in_rad
blt :xlimhok
lda in_rad ;clamp to radius
dex
:xlimhok sta ]max_x
lda in_rad ;cx >= 256 exceeds the max radius,
dex ; so the left edge is never crossed;
sta ]min_x ; just clamp to radius
jmp :xlimdone
* Center X is 255 or less.
* Examples:
* For cx=0 to 24, we can never pass right edge (our
* maximum radius is 255).
* cx=3, 23-3=20 + carry set --> bad, must use rad
* cx=24, 23-24=255 + carry clear --> ok, chk rad
* cx=255, 23-255=24 + carry clear --> ok, chk rad
:xlimlo
lda #<NUM_COLS-1 ;maxX = 279-cx
sec
sbc ]cxl
bcs :xuserad ;no borrow: 279-cx >= 256
cmp in_rad
blt :xlimok
:xuserad lda in_rad ;clamp to radius
dex
:xlimok sta ]max_x
lda ]cxl ;minX = cx (cxh is zero here)
cmp in_rad
blt :xlimok2
lda in_rad ;clamp to radius
dex
:xlimok2 sta ]min_x
:xlimdone
do USE_FAST ;*****
stx circ_clip
fin ;*****
* set top/bottom rows for rasterizer
lda ]cy
clc
adc ]max_y
sta rast_bottom
lda ]cy
sec
sbc ]min_y
sta rast_top
DO 0 ;debug debug debug
LDA ]min_x ;save a copy where the
STA $0380 ; monitor won't trash it
LDA ]max_x
STA $0381
LDA ]min_y
STA $0382
LDA ]max_y
STA $0383
FIN
* Set initial conditions for Bresenham.
ldx #0 ;:x = 0
stx ]xsav
ldy in_rad ;:y = rad
sty ]ysav
lda #1 ;:d = 1 - rad
sec
sbc ]ysav ;in_rad
sta ]dlo
bcs :hizero ;C==1 if in_rad<=1
ldx #$ff ;C was 0, make neg
:hizero stx ]dhi
*
* Outer loop -- plot 8 points, then update values.
*
circ_loop
do USE_FAST ;*****
lda circ_clip
beq ncypy
jmp with_clip
* Quick version, no clipping required
* row cy+y: cx-x and cx+x
ncypy
lda ]ysav
clc
adc ]cy
tay ;y-coord in Y-reg
lda ]cxl
sec
sbc ]xsav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp00 jsr cplotl
lda ]cxl
clc
adc ]xsav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp01 jsr cplotrn
* row cy-y: cx-x and cx+x
ncymy
lda ]cy
sec
sbc ]ysav
tay ;y-coord in Y-reg
lda ]cxl
sec
sbc ]xsav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp02 jsr cplotl
lda ]cxl
clc
adc ]xsav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp03 jsr cplotrn
* row cy+x: cx-y and cx+y
ncypx
lda ]xsav ;off bottom?
clc
adc ]cy
tay ;y-coord in Y-reg
lda ]cxl
sec
sbc ]ysav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp04 jsr cplotl
lda ]cxl
clc
adc ]ysav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp05 jsr cplotrn
* row cy-x: cx-y and cx+y
ncymx
lda ]cy
sec
sbc ]xsav
tay ;y-coord in Y-reg
lda ]cxl
sec
sbc ]ysav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp06 jsr cplotl
lda ]cxl
clc
adc ]ysav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp07 jsr cplotrn
* CLICK
jmp circ_plot_done
fin ;***** (USE_FAST)
*
* Same thing, but this time clipping edges.
*
with_clip
* row cy+y: cx-x and cx+x
ccypy
lda ]ysav ;off bottom?
cmp ]max_y
beq :cypy_ok
bge cypy_skip ;completely off screen
:cypy_ok clc
adc ]cy
tay ;y-coord in Y-reg
ldx ]xsav ;handle cx-x
cpx ]min_x
blt :cxmx_ok
beq :cxmx_ok
lda #0 ;clip at 0
sta rastx0l,y
sta rastx0h,y
beq cxmx_done0 ;always
BREAK
:cxmx_ok lda ]cxl
sec
sbc ]xsav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp08 jsr cplotl
cxmx_done0
cpx ]max_x ;handle cx+x
blt :cxpx_ok
beq :cxpx_ok
lda #<NUM_COLS-1
sta rastx1l,y
lda #>NUM_COLS-1
sta rastx1h,y
bne cxpx_done0 ;always
BREAK
:cxpx_ok lda ]cxl
clc
adc ]xsav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp09 jsr cplotr
cxpx_done0
cypy_skip
* row cy-y: cx-x and cx+x
ccymy
lda ]ysav ;off top?
cmp ]min_y
beq :cymy_ok
bge cymy_skip
:cymy_ok lda ]cy
sec
sbc ]ysav
tay ;y-coord in Y-reg
ldx ]xsav ;handle cx-x
cpx ]min_x
blt :cxmx_ok
beq :cxmx_ok
lda #0 ;clip at 0
sta rastx0l,y
sta rastx0h,y
beq cxmx_done1 ;always
BREAK
:cxmx_ok lda ]cxl
sec
sbc ]xsav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp10 jsr cplotl
cxmx_done1
cpx ]max_x ;handle cx+x
blt :cxpx_ok
beq :cxpx_ok
lda #<NUM_COLS-1
sta rastx1l,y
lda #>NUM_COLS-1
sta rastx1h,y
bne cxpx_done1 ;always
BREAK
:cxpx_ok lda ]cxl
clc
adc ]xsav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp11 jsr cplotr
cxpx_done1
cymy_skip
* row cy+x: cx-y and cx+y
ccypx
lda ]xsav ;off bottom?
cmp ]max_y
beq :cypx_ok
bge cypx_skip
:cypx_ok clc
adc ]cy
tay ;y-coord in Y-reg
ldx ]ysav ;handle cx-y
cpx ]min_x
blt :cxmy_ok
beq :cxmy_ok
lda #0 ;clip at 0
sta rastx0l,y
sta rastx0h,y
beq cxmy_done2 ;always
BREAK
:cxmy_ok lda ]cxl
sec
sbc ]ysav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp12 jsr cplotl
cxmy_done2
cpx ]max_x ;handle cx+y
blt :cxpy_ok
beq :cxpy_ok
lda #<NUM_COLS-1
sta rastx1l,y
lda #>NUM_COLS-1
sta rastx1h,y
bne cxpy_done2 ;always
BREAK
:cxpy_ok lda ]cxl
clc
adc ]ysav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp13 jsr cplotr
cxpy_done2
cypx_skip
* row cy-x: cx-y and cx+y
ccymx
lda ]xsav ;off top?
cmp ]min_y
beq :cymx_ok
bge cymx_skip
:cymx_ok lda ]cy
sec
sbc ]xsav
tay ;y-coord in Y-reg
ldx ]ysav ;handle cx-y
cpx ]min_x
blt :cxmy_ok
beq :cxmy_ok
lda #0 ;clip at 0
sta rastx0l,y
sta rastx0h,y
beq cxmy_done3 ;always
BREAK
:cxmy_ok lda ]cxl
sec
sbc ]ysav
sta rastx0l,y
lda ]cxh
sbc #$00
sta rastx0h,y
_cp14 jsr cplotl
cxmy_done3
cpx ]max_x ;handle cx+y
blt :cxpy_ok
beq :cxpy_ok
lda #<NUM_COLS-1
sta rastx1l,y
lda #>NUM_COLS-1
sta rastx1h,y
bne cxpy_done3 ;always
BREAK
:cxpy_ok lda ]cxl
clc
adc ]ysav
sta rastx1l,y
lda ]cxh
adc #$00
sta rastx1h,y
_cp15 jsr cplotr
cxpy_done3
cymx_skip
circ_plot_done
* Update X/Y/D. Up to about radius=41 we can maintain
* 'd' in an 8-bit register.
do USE_FAST ;*****
lda circ_8bit
beq circ_slow
*
* Bresenham update, with 8-bit 'd'.
*
ldx ]xsav
lda ]dlo
bmi :dneg
txa ;:d = d + ((x-y)*4) +5
sec
sbc ]ysav ;x <= y, may be neg or 0
asl
asl
clc ;can't know carry
adc #5
clc ;still don't want carry
adc ]dlo
sta ]dlo
dec ]ysav ;:y--
jmp :loopbot
:dneg txa ;:d = d + (x*4) +3
asl
asl ;x always pos, C=0
DO 0
BCC :TEST ;debug
BREAK ;debug
:TEST ;debug
FIN
adc #3
adc ]dlo
sta ]dlo
:loopbot
inx ;:x++
stx ]xsav
cpx ]ysav
beq :again
bge circ_done
:again jmp circ_loop
fin ;*****
*
* Bresenham update, with 16-bit 'd'
*
circ_slow
CLICK
ldx ]xsav
lda ]dhi
bmi :dneg
lda ]dlo
clc
adc #5
sta ]dlo
bcc :noinc
inc ]dhi
:noinc
txa ;:d = d + ((x-y)*4) +5
ldy #$00
sty ]hitmp
sec
sbc ]ysav ;x <= y, may be neg or 0
beq :xeqy ;if x==y, nothing to add
ldy #$ff
sty ]hitmp
asl
rol ]hitmp
asl
rol ]hitmp
clc
adc ]dlo
sta ]dlo
lda ]dhi
adc ]hitmp
sta ]dhi
:xeqy
dec ]ysav ;:y--
jmp :loopbot
:dneg lda ]dlo ;:d = d + (x*4) + 3
clc
adc #3
sta ]dlo
bcc :noinc2
inc ]dhi
:noinc2 txa
ldy #0 ;x always positive
sty ]hitmp
asl
rol ]hitmp
asl
rol ]hitmp
clc ;not needed?
adc ]dlo
sta ]dlo
lda ]dhi
adc ]hitmp
sta ]dhi
:loopbot
inx ;:x++
stx ]xsav
cpx ]ysav
beq :again
bge circ_done
:again jmp circ_loop
circ_done rts
* Plot a point for outline circle rendering.
*
* X and Y must be preserved. Y holds the current line
* number.
*
* Most DP locations are in use -- see the variable
* declarations at the start of the circle function.
* cplotl is the entry point for the leftmost point.
* Reads the X coordinate from rastx0l/rastx0h for the
* row in Y, and computes the row's base address.
cplotl
stx ]savxreg
sty ]savyreg
lda ylooklo,y
sta ]hbasl
lda ylookhi,y
_pg_or2 ora #$20
sta ]hbasl+1
* Convert the X coordinate into byte/bit.
ldx rastx0l,y ;x coord, lo
lda rastx0h,y ;>= 256?
beq :lotabl ;no, use the low table
ldy div7hi,x
lda mod7hi,x
bpl cplotcom ;always
BREAK ;debug
:lotabl ldy div7lo,x
lda mod7lo,x
jmp cplotcom
* cplotr is the entry point for the rightmost point.
* We use rastx1 instead of rastx0.
cplotr
lda ylooklo,y
sta ]hbasl
lda ylookhi,y
_pg_or3 ora #$20
sta ]hbasl+1
* If we just plotted the left point on the same line,
* we can skip the Y-lookup by jumping here.
cplotrn
stx ]savxreg
sty ]savyreg
ldx rastx1l,y ;x coord, lo
lda rastx1h,y ;>= 256?
beq :lotabl ;no, use the low table
ldy div7hi,x
lda mod7hi,x
bpl cplotcom ;always
BREAK ;debug
:lotabl ldy div7lo,x
lda mod7lo,x
* Plot the point. The byte offset (0-39) is in Y,
* the bit offset (0-6) is in A.
*
* Only the screen bits selected by andmask,x are
* replaced with the color pattern; the rest of the byte
* is left as it was. X and Y are restored on exit.
cplotcom
tax
lda colorline,y ;start with color pattern
eor (]hbasl),y ;flip all bits
and andmask,x ;clear other bits
eor (]hbasl),y ;restore ours, set theirs
sta (]hbasl),y
ldx ]savxreg
ldy ]savyreg
rts
* Reconfigure calc_circle to either JSR to cplotl/r,
* or just BIT the address (a 4-cycle no-op). The
* desired instruction is in A.
* Stores the opcode in A over the first byte of every
* plot call site in calc_circle (_cp00.._cp15). The
* no-clip sites _cp00.._cp07 only exist when USE_FAST
* is set. DrawCircle passes $20 (JSR) and FillCircle
* passes $2c (BIT).
fixcplot
do USE_FAST ;*****
sta _cp00
sta _cp01
sta _cp02
sta _cp03
sta _cp04
sta _cp05
sta _cp06
sta _cp07
fin ;*****
sta _cp08
sta _cp09
sta _cp10
sta _cp11
sta _cp12
sta _cp13
sta _cp14
sta _cp15
rts

588
FDRAW.LINE.S Normal file
View File

@ -0,0 +1,588 @@
********************************
* *
* Fast Apple II Graphics *
* By Andy McFadden *
* Version 0.3, Aug 2015 *
* *
* Point and line functions *
* (Included by FDRAW.S) *
* *
* Developed with Merlin-16 *
* *
********************************
********************************
*
* Draw a single point in the current color.
*
********************************
* Inputs: in_x0l/in_x0h (X) and in_y0 (Y). The
* coordinates are not range-checked here.
DrawPoint
]hbasl equ zptr0
ldy in_y0
lda ylooklo,y ;row base address, low
sta ]hbasl
lda ylookhi,y ;row base address, high
ora g_page ; merged with the page bits
sta ]hbasl+1
ldx in_x0l ;x coord, lo
lda in_x0h ;>= 256?
beq :lotabl ;no, use the low table
ldy div7hi,x
lda mod7hi,x
bpl :plotit ;always
BREAK ;debug
:lotabl ldy div7lo,x
lda mod7lo,x
* Plot the point. The byte offset (0-39) is in Y,
* the bit offset (0-6) is in A.
:plotit
tax
lda colorline,y ;start with color pattern
eor (]hbasl),y ;flip all bits
and andmask,x ;clear other bits
eor (]hbasl),y ;restore ours, set theirs
sta (]hbasl),y
rts
********************************
*
* Draw a line between two points.
*
********************************
DrawLine
]hbasl equ zptr0
]xposl equ zloc0 ;always left edge
]xposh equ zloc1
]ypos equ zloc2 ;top or bottom
]deltaxl equ zloc3
]deltaxh equ zloc4
]deltay equ zloc5
]count equ zloc6
]counth equ zloc7
]diff equ zloc8
]diffh equ zloc9
]andmask equ zloc10
]wideflag equ zloc11 ;doesn't really need DP
* We use a traditional Bresenham run-length approach.
* Run-slicing is possible, but the code is larger
* and the increased cost means it's only valuable
* for longer lines. An optimal solution would switch
* approaches based on line length.
*
* Start by identifying whether x0 or x1 is on the
* left. To make life simpler we always work from
* left to right, flipping the coordinates if
* needed.
*
* We also need to figure out if the line is more
* than 255 pixels long -- which, because of
* inclusive coordinates, means abs(x0-x1) > 254.
lda in_x1l ;assume x0 on left
sec
sbc in_x0l
tax
beq checkvert ;low bytes even, check hi
lda in_x1h
sbc in_x0h
bcs lx0left
* x1 is on the left, so the values are negative
* (hi byte in A, lo byte in X)
lx0right eor #$ff ;invert hi
sta ]deltaxh ;store
txa
eor #$ff ;invert lo
sta ]deltaxl
inc ]deltaxl ;add one for 2s complement
bne :noinchi ;rolled into high byte?
inc ]deltaxh ;yes
:noinchi lda in_x1l ;start with x1
sta ]xposl
lda in_x1h
sta ]xposh
lda in_y1
sta ]ypos
sec
sbc in_y0 ;compute deltay
jmp lncommon
checkvert
lda in_x1h ;diff high bytes
sbc in_x0h ;(carry still set)
blt lx0right ;width=256, x0 right
bne lx0left ;width=256, x0 left
jmp vertline ;all zero, go vert
* (branch back from below)
* This is a purely horizontal line. We farm the job
* out to the raster fill code for speed. (There's
* no problem with the line code handling it; it's just
* more efficient to let the raster code do it.)
phorizontal
ldy ]ypos
sty rast_top
sty rast_bottom
lda ]xposl
sta rastx0l,y
clc
adc ]deltaxl ;easier to add delta back
sta rastx1l,y ; in than sort out which
lda ]xposh ; arg is left vs. right
sta rastx0h,y
adc ]deltaxh
sta rastx1h,y
jmp FillRaster
* x0 is on the left, so the values are positive
lx0left stx ]deltaxl
sta ]deltaxh
lda in_x0l ;start with x0
sta ]xposl
lda in_x0h
sta ]xposh
lda in_y0 ;and y0
sta ]ypos
sec
sbc in_y1 ;compute deltay
* Value of (starty - endy) is in A, flags still set.
lncommon
bcs :posy
eor #$ff ;negative, invert
adc #$01
sta ]deltay
lda #$e8 ;INX
bne gotdy
:posy
_lmb beq phorizontal
sta ]deltay
lda #$ca ;DEX
gotdy sta _hmody
sta _vmody
sta _wmody
do 0 ;***** for regression test
ldx #$01
lda ]deltaxh
bne :iswide
lda ]deltaxl
cmp #$ff ;== 255?
beq :iswide
ldx #$00 ;notwide
:iswide stx $300
lda ]xposl
sta $301
lda ]xposh
sta $302
lda ]ypos
sta $303
ldx ]deltaxl
stx $304
ldx ]deltaxh
stx $305
ldx ]deltay
stx $306
lda _hmody
and #$20 ;nonzero means inc,
sta $307 ; zero means dec
fin ;*****
* At this point we have the initial X position in
* ]xposl/h, the initial Y position in ]ypos,
* deltax in ]deltaxl/h, deltay in ]deltay, and we've
* tweaked the Y-update instructions to either INX or
* DEX depending on the direction of movement.
*
* The next step is to decide whether the line is
* horizontal-dominant or vertical-dominant, and
* branch to the appropriate handler.
*
* The core loops for horiz and vert take about
* 80 cycles when moving diagonally, and about
* 20 fewer when moving in the primary direction.
* The wide-horiz is a bit slower.
ldy #$01 ;set "wide" flag to 1
lda ]deltaxl
ldx ]deltaxh
bne horzdom ;width >= 256
cmp #$ff ;width == 255
beq horzdom
dey ;not wide
cmp ]deltay
bge horzdom ; for diagonal lines
jmp vertdom
* We could special-case pure-diagonal lines here
* (just BEQ a couple lines up). It does
* represent our worst case. I'm not convinced
* we'll see them often enough to make it worthwhile.
* horizontal-dominant
horzdom
sty ]wideflag
sta ]count ;:count = deltax + 1
inc ]count
lsr ;:diff = deltax / 2
sta ]diff
* set Y to the byte offset in the line
* load the AND mask into ]andmask
ldx ]xposl
lda ]xposh ;>= 256?
beq :lotabl ;no, use the low table
ldy div7hi,x
lda mod7hi,x
bpl :gottab ;always
* BREAK ;debug
:lotabl ldy div7lo,x
lda mod7lo,x
:gottab
tax
lda andmask,x
sta ]andmask
* Set initial value for line address.
ldx ]ypos
lda ylooklo,x
sta ]hbasl
lda ylookhi,x
ora g_page
sta ]hbasl+1
lda ]wideflag ;is this a "wide" line?
beq :notwide ;nope, stay local
jmp widedom
:notwide lda colorline,y ;set initial color mask
sta _hlcolor+1
jmp horzloop
hrts rts
* bottom of loop, essentially
hnoroll sta ]diff ;3
hdecc dec ]count ;5 :count--
beq hrts ;2 :while (count != 0)
;= 7 or 10
* We keep the byte offset in the line in Y, and the
* line index in X, for the entire loop.
horzloop
_hlcolor lda #$00 ;2 start with color pattern
_lmdh eor (]hbasl),y ;5 flip all bits
and ]andmask ;3 clear other bits
eor (]hbasl),y ;5 restore ours, set theirs
sta (]hbasl),y ;6 = 21
* Move right. We shift the bit mask that determines
* the pixel. When we shift into bit 7, we know it's
* time to advance another byte.
*
* If this is a shallow line we would benefit from
* keeping the index in X and just doing a 4-cycle
* indexed load to get the mask. Not having the
* line number in X makes the line calc more
* expensive for steeper lines though.
lda ]andmask ;3
asl ;2 shift, losing hi bit
eor #$80 ;2 set the hi bit
bne :noh8 ;3 cleared hi bit?
* We could BEQ away and branch back in, but this
* happens every 7 iterations, so on average it's
* a very small improvement. If we happen to branch
* across a page boundary the double-branch adds
* two more cycles and we lose.
iny ;2 advance to next byte
lda colorline,y ;4 update color mask
sta _hlcolor+1 ;4
lda #$81 ;2 reset
:noh8 sta ]andmask ;3 = 13 + ((12-1)/7) = 14
* Update error diff.
lda ]diff ;3
sec ;2
sbc ]deltay ;3 :diff -= deltay
bcs hnoroll ;2+ :if (diff < 0) ...
;= 11 level, 10 up/down
adc ]deltaxl ;3 : diff += deltax
sta ]diff ;3
_hmody inx ;2 : ypos++ (or --)
lda ylooklo,x ;4 update hbasl after line
sta ]hbasl ;3 change
lda ylookhi,x ;4
_pg_or4 ora #$20 ;2
sta ]hbasl+1 ;3
bne hdecc ;3 = +27 this path -> 37
BREAK
* horizontal: 10+21+14+11=56 cycles/pixel
* diagonal: 7+21+14+37=79 cycles/pixel
* Vertical-dominant line. Could go up or down.
* Setup: patch the loop's end-of-line test with the Y
* coordinate of whichever endpoint we did not start
* from, then compute the initial byte offset, pixel
* mask, and color mask.
vertdom
ldx in_y0
cpx ]ypos ;did we start at y0?
bne :endy0 ;no, started at y1: end at y0
ldx in_y1 ;yes, so end at y1
:endy0 stx _vchk+1 ;end condition
lda ]deltay
lsr
sta ]diff ;:diff = deltay / 2
* set Y to the byte offset in the line
* load the AND mask into ]andmask
ldx ]xposl
lda ]xposh ;>= 256?
beq :lotabl ;no, use the low table
ldy div7hi,x
lda mod7hi,x
bpl :gottab ;always
BREAK ;debug
:lotabl ldy div7lo,x
lda mod7lo,x
:gottab
tax
lda andmask,x ;initial pixel mask
sta ]andmask
lda colorline,y ;initial color mask
sta _vlcolor+1
ldx ]ypos
jmp vertloop
* We keep the byte offset in the line in Y, and the
* line index in X, for the entire loop.
* Bottom of loop, essentially.
vnoroll sta ]diff ;3
vertloop
lda ylooklo,x ;4
sta ]hbasl ;3
lda ylookhi,x ;4
_pg_or5 ora #$20 ;2
sta ]hbasl+1 ;3 = 16
_vlcolor lda #$00 ;2 start with color pattern
_lmdv eor (]hbasl),y ;5 flip all bits
and ]andmask ;3 clear other bits
eor (]hbasl),y ;5 restore ours, set theirs
sta (]hbasl),y ;6 = 21
_vchk cpx #$00 ;2 was this last line?
beq vrts ;2 yes, done
_vmody inx ;2 :ypos++ (or --)
* Update error diff.
lda ]diff ;3
sec ;2
sbc ]deltaxl ;3 :diff -= deltax
bcs vnoroll ;2 :if (diff < 0) ...
;= 10 vert, 9 move right
adc ]deltay ;3 : diff += deltay
sta ]diff ;3
* Move right. We shift the bit mask that determines
* the pixel. When we shift into bit 7, we know it's
* time to advance another byte.
lda ]andmask ;3
asl ;2 shift, losing hi bit
eor #$80 ;2 set the hi bit
beq :is8 ;2+ goes to zero on 8th bit
sta ]andmask ;3
bne vertloop ;3 = 21 + (18/7) = 24
BREAK
:is8 iny ;2 advance to next byte
lda colorline,y ;4 update color
sta _vlcolor+1 ;4
lda #$81 ;2 reset
sta ]andmask ;3
bne vertloop ;3 = 18
BREAK
vrts rts
* vertical: 3 + 16 + 21 + 6 + 10 = 56 cycles
* diagonal: 16 + 21 + 6 + 9 + 24 = 76 cycles
* "Wide" horizontally-dominant loop. We have to
* maintain error-diff and deltax as 16-bit values.
* Most of the setup from the "narrow" version carried
* over, but we have to re-do the count and diff.
*
* Normally we set count to (deltax + 1) and decrement
* to zero, but it's actually easier to set it equal
* to deltax and check for -1.
* On entry (from horzdom) Y holds the byte offset within
* the row, ]andmask the pixel mask, and ]hbasl the row
* address. Count and diff are recomputed here as 16-bit
* values.
widedom
         lda   ]deltaxh   ;:count = deltax
         sta   ]counth
         ldx   ]deltaxl
         stx   ]count
         stx   ]diff
         lsr              ;:diff = deltax / 2
         ror   ]diff
         sta   ]diffh
         ldx   ]ypos
         lda   colorline,y ;set initial color mask
         sta   _wlcolor+1
* We keep the byte offset in the line in Y, and the
* line index in X, for the entire loop.
wideloop
_wlcolor lda   #$00       ;2 start with color pattern
_lmdw    eor   (]hbasl),y ;5 flip all bits
         and   ]andmask   ;3 clear other bits
         eor   (]hbasl),y ;5 restore ours, set theirs
         sta   (]hbasl),y ;6 = 21
* Move right. We shift the bit mask that determines
* the pixel. When we shift into bit 7, we know it's
* time to advance another byte.
         lda   ]andmask   ;3
         asl              ;2 shift, losing hi bit
         eor   #$80       ;2 set the hi bit
         bne   :not7      ;3 goes to zero on 8th bit
         iny              ; 2 advance to next byte
         lda   colorline,y ; 4 update color mask
* The mask must go into this loop's own LDA operand
* (_wlcolor); _hlcolor belongs to the narrow loop and
* is never executed for a wide line.
         sta   _wlcolor+1 ; 4
         lda   #$81       ; 2 reset
:not7    sta   ]andmask   ;3 = 13 usually, 25 every 7
* Update error diff, which is a positive number. If
* it goes negative ("if (diff < 0)") we act.
         lda   ]diff
         sec
         sbc   ]deltay    ;:diff -= deltay
         bcs   wnoroll    ;didn't even roll low byte
         dec   ]diffh     ;check hi byte
         bpl   wnoroll    ;went 1->0, keep going
         adc   ]deltaxl   ;: diff += deltax (C is clear)
         sta   ]diff
         lda   ]diffh
         adc   ]deltaxh
         sta   ]diffh
_wmody   inx              ;: ypos++ (or --)
         lda   ylooklo,x  ;update hbasl after line
         sta   ]hbasl     ; change
         lda   ylookhi,x
_pg_or6  ora   #$20
         sta   ]hbasl+1
         bne   wdecc      ;always
         BREAK
wnoroll  sta   ]diff
wdecc    dec   ]count     ;5 :count--
         lda   ]count     ;3
         cmp   #$ff       ;2
         bne   wideloop   ;3 :while (count > -1)
         dec   ]counth    ;low rolled, decr high
         beq   wideloop   ;went 1->0, keep going
         rts
* Pure-vertical line. These are common in certain
* applications, and checking for it only adds two
* cycles to the general case.
* Setup: order the two Y values so we always draw from
* the smaller to the larger, and patch the loop's end
* test with (larger + 1). The pixel mask and color
* mask are constant for the whole line, so they are
* patched into the loop as immediates.
vertline
ldx in_y0
ldy in_y1
cpx in_y1 ;y0 < y1?
blt :usey0 ;yes, go from y0 to y1
txa ;no: swap, so that Y = y0
tay ; (the larger value) and
ldx in_y1 ; X = y1 (the smaller)
:usey0 stx ]ypos
iny ;end test is last row + 1
sty _pvytest+1
ldx in_x0l ;xc lo
lda in_x0h ;>= 256?
beq :lotabl
ldy div7hi,x
lda mod7hi,x
bpl :gotit ;always
:lotabl ldy div7lo,x
lda mod7lo,x
* Byte offset is in Y, mod-7 value is in A.
:gotit tax
lda andmask,x
sta _pvand+1 ;this doesn't change
lda colorline,y
sta _pvcolor+1 ;nor does this
ldx ]ypos ;top line
* There's a trick where, when (linenum & 0x07) is
* nonzero, you just add 4 to hbasl+1 instead of
* re-doing the lookup. However, TXA+AND+BEQ
* followed by LDA+CLC+ADC+STA is 16 cycles, the same
* as our self-modified lookup, so it's not a win.
* (And if we used a second ylookhi and self-modded
* the table address, we could shave off another 2.)
* Main pure-vertical loop
pverloop
lda ylooklo,x ;4
sta ]hbasl ;3
lda ylookhi,x ;4
_pg_or7 ora #$20 ;2
sta ]hbasl+1 ;3 (= 16)
_pvcolor lda #$00 ;2 start with color pattern
_lmdpv eor (]hbasl),y ;5 flip all bits
_pvand and #$00 ;2 clear other bits
eor (]hbasl),y ;5
sta (]hbasl),y ;6 (= 20)
inx ;2
_pvytest cpx #$00 ;2 done?
bne pverloop ;3 = 7
rts
* 43 cycles/pixel
********************************
*
* Set the line mode according to in_arg
*
* A slightly silly feature to get xdraw lines
* without really working for it.
*
********************************
* Works by patching opcodes inside DrawLine. For any
* nonzero in_arg, the first "EOR (dp),y" of each plot
* sequence (_lmdh/_lmdv/_lmdw/_lmdpv) and the "BEQ
* phorizontal" shortcut (_lmb) are overwritten with the
* BIT dp opcode. For in_arg = 0 the original opcodes
* are put back.
SetLineMode
lda in_arg
beq :standard
* configure for xdraw
lda #$24 ;BIT dp
sta _lmb ;disable the horizontal shortcut
sta _lmdh
sta _lmdv
sta _lmdw
sta _lmdpv
rts
* configure for standard drawing
:standard lda #$f0 ;BEQ
sta _lmb ;re-enable the horizontal shortcut
lda #$51 ;EOR (dp),y
sta _lmdh
sta _lmdv
sta _lmdw
sta _lmdpv
rts

805
FDRAW.S Normal file
View File

@ -0,0 +1,805 @@
********************************
* *
* Fast Apple II Graphics *
* By Andy McFadden *
* Version 0.3, Aug 2015 *
* *
* Main source file *
* *
* Developed with Merlin-16 *
* *
********************************
* Set to 1 to build FDRAW.FAST, set to zero to
* build FDRAW.SMALL.
* (Tested by the DO USE_FAST blocks in this file and in
* the tables file, and by the final SAV.)
USE_FAST equ 1
* Set to 1 to turn on beeps/clicks for debugging.
* This also controls whether the BREAK macro emits a BRK.
NOISE_ON equ 0
lst off
org $6000 ;code is assembled to run at $6000
*
* Macros.
*
* CLICK, BEEP and BREAK are debugging aids; with
* NOISE_ON set to 0 they expand to nothing.
*
spkr equ $c030 ;address touched by CLICK
bell equ $ff3a ;routine called by BEEP (and by SetPage on a bad page)
* If enabled, click the speaker (changes flags only).
CLICK mac
do NOISE_ON
bit spkr
fin
<<<
* If enabled, beep the speaker (scrambles regs).
BEEP mac
do NOISE_ON
jsr bell
fin
<<<
* If enabled, insert a BRK.
BREAK mac
do NOISE_ON
brk $99
fin
<<<
* In "fast" mode, we align tables on page boundaries so we
* don't take a 1-cycle hit when the indexing crosses a page.
* In "small" mode, we skip the alignment.
PG_ALIGN mac
do USE_FAST
ds \
fin
<<<
*
* Hi-res screen constants.
*
BYTES_PER_ROW = 40 ;bytes in one screen row
NUM_ROWS = 192 ;rows on the screen
NUM_COLS = 280 ;pixels in one screen row
*
* Variable storage. We assign generic names to
* zero-page scratch locations, then assign variables
* with real names to these.
*
* 06-09 are unused (except by SWEET-16)
* 1a-1d are Applesoft hi-res scratch
* cc-cf are only used by INTBASIC
* eb-ef and ff appear totally unused by ROM routines
*
* Routines give these their working names with local
* "]name equ zlocN" lines, so the same location serves
* different purposes in different routines.
*
zptr0 equ $1a ;2b
zloc0 equ $06
zloc1 equ $07
zloc2 equ $08
zloc3 equ $09
zloc4 equ $1c
zloc5 equ $1d
zloc6 equ $cc
zloc7 equ $cd
zloc8 equ $ce
zloc9 equ $cf
zloc10 equ $eb
zloc11 equ $ec
zloc12 equ $ed
zloc13 equ $ee
********************************
*
* Entry points for external programs.
*
* Everything from Entry through the raster table pointers
* is a fixed layout that callers address by offset, so
* the order and size of these items must not change.
* Offsets from Entry: +0 JMP Init, +3 version, +5..+12
* parameters, +13..+15 padding, +16 start of the jump
* table (3 bytes per entry).
*
********************************
Entry
jmp Init ;initialize data tables
dfb 0,3 ;version number
*
* Parameters passed from external programs.
*
in_arg ds 1 ;generic argument
in_x0l ds 1 ;X coordinate 0, low part
in_x0h ds 1 ;X coordinate 0, high part
in_y0 ds 1 ;Y coordinate 0
in_x1l ds 1 ;X coordinate 1, low part
in_x1h ds 1 ;X coordinate 1, high part
in_y1 ds 1 ;Y coordinate 1
in_rad ds 1 ;radius for circles
ds 3 ;pad to 16 bytes
jmp SetColor
jmp SetPage
jmp Clear
jmp DrawPoint
jmp DrawLine
jmp DrawRect
jmp FillRect
jmp DrawCircle
jmp FillCircle
jmp SetLineMode
jmp noimpl ;reserved2
jmp FillRaster
* Raster fill values. Top, bottom, and pointers to tables
* for the benefit of external callers.
rast_top ds 1 ;first row FillRaster draws
rast_bottom ds 1 ;last row FillRaster draws (inclusive)
da rastx0l
da rastx0h
da rastx1l
da rastx1h
noimpl rts ;target for reserved jump table slots
********************************
*
* Global variables.
*
********************************
g_inited dfb 0 ;initialized? (Init stores a nonzero value)
g_color dfb 0 ;hi-res color (0-7)
g_page dfb $20 ;hi-res page ($20 or $40)
********************************
*
* Initialize: normal line mode, color 0, hi-res page 1.
* SetPage refuses to work until this has run, because
* it checks g_inited.
*
********************************
Init
         lda   #$00
         sta   in_arg     ;argument 0 for both calls below
         jsr   SetLineMode ;set normal lines
         jsr   SetColor   ;set color to zero
         lda   #$20
         sta   g_inited   ;any nonzero value means "initialized"
         sta   in_arg     ;$20 selects hi-res page 1
         jmp   SetPage    ;tail call, returns to our caller
********************************
*
* Set the color.
*
* In:  in_arg = color; only the low three bits are used.
* Out: g_color holds the masked color and the 40-byte
*      colorline table holds the odd/even byte patterns
*      for it.  Returns immediately if the masked color
*      is already current.
* Uses A always, X and Y when the table is rebuilt.
*
********************************
SetColor
         lda   in_arg
         and   #$07       ;safety first: mask before the compare
         cmp   g_color    ;same as the old color?
         beq   :done      ;yes (even if in_arg was out of range)
         sta   g_color
* Update the "colorline" table, which provides a quick color
* lookup for odd/even bytes.  We could also have one table
* per color and self-mod the "LDA addr,y" instructions to
* point to the current one, but that uses a bunch of memory
* and is kind of ugly.  Takes 16 + (12 * 40) = 496 cycles.
         tax              ;2
         lda   xormask,x  ;4
         sta   :_xormsk+1 ;4
         lda   oddcolor,x ;4 last byte (39) is odd
         ldy   #BYTES_PER_ROW-1 ;2
]loop    sta   colorline,y ;5
:_xormsk eor   #$00       ;2 operand patched above
         dey              ;2
         bpl   ]loop      ;3
:done    rts
********************************
*
* Set the page.
*
* In: in_arg = $20 or $40.  Any other value just calls
* the bell routine and changes nothing.  If Init has not
* run (g_inited is zero) control goes to noinit instead.
*
********************************
SetPage
lda g_inited ;let's just check this
beq noinit ; (not called too often)
lda in_arg
cmp #$20
beq :good
cmp #$40
beq :good
jmp bell
:good
sta g_page
do 0 ;*****
cmp ylookhi
beq :tabok
* Check to see if the values currently in the Y-lookup table
* match our current page setting. If they don't, we need to
* adjust the code that does lookups.
* This approach modifies the table itself, paying a large
* cost now so we don't have to pay it on every lookup.
* However, this costs 2+(16*192)=3074 cycles, while an
* "ORA imm" only adds two to each lookup, so we'd have
* to do a lot of drawing to make this worthwhile.
* (Note: assumes ylookhi is based at $2000 not $0000)
ldy #NUM_ROWS ;2
]loop lda ylookhi-1,y ;4
eor #$60 ;2 $20 <--> $40
sta ylookhi-1,y ;5
dey ;2
bne ]loop ;3
else ;*****
* This approach uses self-modifying code to update the
* relevant instructions. It's a bit messy to have it
* here, but it saves us from having to do it on
* every call.
*
* We could also have a second y-lookup table and
* use this to update the pointers. That would let
* us drop the "ORA imm" entirely, without the cost
* of the rewrite above, but eating up another 192 bytes.
*
* A still holds the page value; each store replaces the
* immediate operand of one "ORA #" instruction.
sta _pg_or1+1 ;rastfill
sta _pg_or2+1 ;circle hplot
sta _pg_or3+1 ;circle hplot
sta _pg_or4+1 ;drawline
sta _pg_or5+1 ;drawline
sta _pg_or6+1 ;drawline
sta _pg_or7+1 ;drawline
fin ;*****
:tabok rts
* Reached from SetPage when g_inited is zero.  Sends
* :initmsg to the character output routine one byte at
* a time, stopping at the zero byte that ends the
* message, then returns to SetPage's caller.
noinit ldy #$00
]loop lda :initmsg,y
beq :done
jsr $fded ;cout
iny
bne ]loop
:done rts
* NOTE(review): the two 87 bytes are presumably bell
* characters -- confirm.
:initmsg asc "FDRAW NOT INITIALIZED",87,87,00
********************************
*
* Clear the screen to the current color.
*
* Reads g_color and g_page.  In both build variants the
* store instructions are assembled for page 1; when
* g_page does not match the page currently encoded in
* them, their address high bytes are flipped in place
* before the clear starts.
*
********************************
Clear
do USE_FAST ;*****
* This performs a "visually linear" clear, erasing the screen
* from left to right and top to bottom. To reduce the amount
* of code required we erase in thirds (top/middle/bottom).
*
* Compare to a "venetian blind" clear, which is what you get
* if you erase memory linearly.
*
* The docs discuss different approaches. This version
* requires ((2 + 5*64 + 11) * 40 + 14) * 3 = 40002 cycles.
* If we didn't divide it into thirds to keep the top-down
* look, we'd need (5*64 + 9) * 120 = 39480 cycles, so
* we're spending 522 cycles to avoid the venetian look.
lda :clrloop+2 ;address high byte of the first store
cmp g_page
beq :pageok
* We're on the wrong hi-res page. Flip to the other one.
* 4 + (20*64) = 1284 cycles to do the flip (+ a few more
* because we're probably crossing a page boundary).
BEEP
* Y steps 192,189,...,3 so that :clrloop-3+2,y lands on the
* high byte of each of the 64 three-byte stores.
ldy #NUM_ROWS ;2
]loop lda :clrloop-3+2,y ;4
eor #$60 ;2
sta :clrloop-3+2,y ;5
dey ;2
dey ;2
dey ;2
bne ]loop ;3
:pageok ldx g_color ;grab the current color
lda xormask,x
sta :_xormsk+1
lda evencolor,x
ldy #0
jsr :clearthird
ldy #BYTES_PER_ROW
jsr :clearthird
ldy #BYTES_PER_ROW*2
* fall through into :clearthird for final pass
* On entry Y = starting byte offset (0, 40 or 80) and A =
* color byte for the first column.  Runs 40 times; A is
* XORed an even number of times, so it is unchanged on exit.
:clearthird
ldx #BYTES_PER_ROW-1 ;2
:clrloop sta $2000,y ;5 (* 64)
sta $2400,y ;this could probably be
sta $2800,y ; done with LUP math
sta $2c00,y
sta $3000,y
sta $3400,y
sta $3800,y
sta $3c00,y
sta $2080,y
sta $2480,y
sta $2880,y
sta $2c80,y
sta $3080,y
sta $3480,y
sta $3880,y
sta $3c80,y
sta $2100,y
sta $2500,y
sta $2900,y
sta $2d00,y
sta $3100,y
sta $3500,y
sta $3900,y
sta $3d00,y
sta $2180,y
sta $2580,y
sta $2980,y
sta $2d80,y
sta $3180,y
sta $3580,y
sta $3980,y
sta $3d80,y
sta $2200,y
sta $2600,y
sta $2a00,y
sta $2e00,y
sta $3200,y
sta $3600,y
sta $3a00,y
sta $3e00,y
sta $2280,y
sta $2680,y
sta $2a80,y
sta $2e80,y
sta $3280,y
sta $3680,y
sta $3a80,y
sta $3e80,y
sta $2300,y
sta $2700,y
sta $2b00,y
sta $2f00,y
sta $3300,y
sta $3700,y
sta $3b00,y
sta $3f00,y
sta $2380,y
sta $2780,y
sta $2b80,y
sta $2f80,y
sta $3380,y
sta $3780,y
sta $3b80,y
sta $3f80,y
:_xormsk eor #$00 ;2 flip odd/even bits
iny ;2
dex ;2
bmi :done ;2
jmp :clrloop ;3
:done rts
else ;***** not USE_FAST
* This version was suggested by Marcus Heuser on
* comp.sys.apple2.programmer. It does a "venetian blind"
* clear, and takes (5 * 32 + 7) * 248 = 41416 cycles.
* It overwrites half of the screen holes.
lda :clrloop+5 ;high byte of the second store ($20 or $40)
cmp g_page
beq :pageok
* We're on the wrong hi-res page. Flip to the other one.
* 12 + (20*31) = 632 cycles to do the flip. We have to
* single out the first entry because it's $1f not $20.
BEEP
lda :clrloop+2 ;4
eor #$20 ;2 $1f <-> $3f
sta :clrloop+2 ;4
ldy #31*3 ;2
]loop lda :clrloop+2,y ;4
eor #$60 ;2 $20 <-> $40
sta :clrloop+2,y ;5
dey ;2
dey ;2
dey ;2
bne ]loop ;3
:pageok ldx g_color
lda xormask,x
sta :_xormsk+1
lda oddcolor,x
* Y counts 248 down to 1; with the store bases at $xxff
* that covers offsets $00-$f7 of each of the 32 pages.
ldy #248 ;120 + 8 + 120
:clrloop
]addr = $1fff
lup 32 ;begin a loop in assembler
sta ]addr,y ;5
]addr = ]addr+$100 ;sta 20ff,21ff,...
--^
:_xormsk eor #$00 ;2
dey ;2
bne :clrloop ;3
rts
fin ;***** not USE_FAST
********************************
*
* Draw rectangle outline.
*
* In: in_x0l/h, in_y0 = one corner; in_x1l/h, in_y1 = the
* other.  The code below treats y0 as the top row and x0
* as the left column.  Overwrites rast_top, rast_bottom
* and raster table entries for the rows it draws.
*
********************************
DrawRect
* We could just issue 4 line draw calls here, maybe
* adjusting the vertical lines by 1 pixel up/down to
* avoid overdraw. But if the user wanted 4 lines,
* they could just draw 4 lines. Instead, we're going
* to draw a double line on each edge to ensure that
* the outline rectangle always has the correct color.
*
* Rather than draw two vertical lines, we draw a
* two-pixel-wide filled rectangle on each side.
*
* We don't want to double-up if the rect is only one
* pixel wide, so we have to check for that.
*
* If the rect is one pixel high, it's just a line.
* If it's two pixels high, we don't need to draw
* the left/right edges, just the top/bottom lines.
* If it's more than two tall, we don't need to draw
* the left/right edges on the top and bottom lines,
* so we save a few cycles by skipping those.
lda in_y1 ;copy top/bottom to local
sta rast_bottom
dec rast_bottom ;move up one
sec
sbc in_y0 ;A = y1 - y0
beq :isline ;1 pixel high, just draw line
cmp #1
beq :twolines ;2 pixels high, lines only
ldy in_y0
iny ;start down a line
sty rast_top
lda in_x0h ;check to see if left/right
cmp in_x1h ; coords are the same; if
bne :notline ; so, going +1/-1 at edge
lda in_x0l ; will overdraw.
cmp in_x1l
bne :notlin1 ;A already holds in_x0l here
:isline jmp DrawLine ;just treat like line
* Set up left edge. Top line is in Y.
* Left strip is x0 .. x0+1; the carry from the low-byte
* add survives the LDA/ORA/STA/AND and feeds ADC #0.
:notline lda in_x0l
:notlin1 sta rastx0l,y
clc
adc #1
sta rastx1l,y
lda in_x0h
ora #$80 ;"repeat" flag
sta rastx0h,y
and #$7f
adc #0
sta rastx1h,y
jsr FillRaster
* Right strip is x1-1 .. x1; the borrow from the low-byte
* subtract survives the LDA/STA and feeds SBC #0.
ldy rast_top
lda in_x1l ;now set up right edge
sta rastx1l,y
sec
sbc #1
sta rastx0l,y
lda in_x1h
sta rastx1h,y
sbc #0
ora #$80 ;"repeat" flag
sta rastx0h,y
jsr FillRaster
* Now the top/bottom lines.
:twolines
ldy in_y0
jsr :drawline
ldy in_y1 ;then fall into :drawline for the bottom
* Fill the single row in Y from x0 to x1 (no repeat flag).
:drawline
sty rast_top
sty rast_bottom
lda in_x0l ;copy left/right to the
sta rastx0l,y ; table entry for the
lda in_x0h ; appropriate line
sta rastx0h,y
lda in_x1l
sta rastx1l,y
lda in_x1h
sta rastx1h,y
jmp FillRaster
********************************
*
* Draw filled rectangle.
*
* In: in_x0l/h = left, in_x1l/h = right, in_y0 = top,
*     in_y1 = bottom.  The caller must supply them in
*     that order; nothing is swapped here.
*
********************************
FillRect
* Only the top row of the raster table is filled in.  Its
* left edge carries the "repeat" flag, so FillRaster reuses
* the same span for every row down to rast_bottom.
         lda   in_y1
         sta   rast_bottom
         ldy   in_y0      ;Y = top row = table index
         sty   rast_top
         lda   in_x1h     ;right edge
         sta   rastx1h,y
         lda   in_x1l
         sta   rastx1l,y
         lda   in_x0l     ;left edge
         sta   rastx0l,y
         lda   in_x0h
         ora   #$80       ;"repeat" flag
         sta   rastx0h,y
         jmp   FillRaster ;tail call
********************************
*
* Fill an area defined by the raster tables.
*
* In: rast_top / rast_bottom = first and last row
* (inclusive); rastx0l/h and rastx1l/h indexed by row =
* left and right pixel of the span on that row.  Uses
* g_color via the colorline table.
*
********************************
FillRaster
* Render rasterized output. The left and right edges
* are stored in the rastx0/rastx1 tables, and the top
* and bottom-most pixels are in rast_top/rast_bottom.
*
* This can be used to render an arbitrary convex
* polygon after it has been rasterized.
*
* If the high bit of the high byte of X0 is set, we
* go into "repeat" mode, where we just repeat the
* previous line. This saves about 40 cycles of
* overhead per line when drawing rectangles, plus
* what we would have to spend to populate multiple
* lines of the raster table. It only increases the
* general per-line cost by 3 cycles.
*
* We could use the "repeat" flag to use this code to
* draw vertical lines, though that's mostly of value
* to an external caller who knows ahead of time that
* the line is vertical. The DrawLine code is pretty
* good with vertical lines, and adding additional
* setup time to every vertical-dominant line to
* decide if it should call here seems like a
* losing proposition.
]hbasl equ zptr0
]hbash equ zptr0+1
]lftbyte equ zloc0
]lftbit equ zloc1
]rgtbyte equ zloc2
]rgtbit equ zloc3
]line equ zloc4 ;(not referenced in the code below)
]andmask equ zloc5
]cur_line equ zloc6
]repting equ zloc7
ldx g_color ;configure color XOR byte
lda xormask,x
do USE_FAST ;*****
cmp rast_unroll+3 ;already configured?
beq :goodmask
jsr fixrastxor
:goodmask
else
sta _xorcolor+1
fin ;*****
lda #$00
sta ]repting ;zero = no repeated row computed yet
ldy rast_top
* Main rasterization loop. Y holds the line number.
rastloop
sty ]cur_line ;3
ldx ylooklo,y ;4
stx ]hbasl ;3
lda ylookhi,y ;4
_pg_or1 ora #$20 ;2 will be $20 or $40
sta ]hbash ;3 = 19 cycles
do USE_FAST-1 ;***** i.e. not USE_FAST
stx _wrhires+1
sta _wrhires+2
fin ;*****
* divide left edge by 7
ldx rastx0l,y ;4 line num in Y
lda rastx0h,y ;4
bpl :noflag ;2
* Repeat flag is set: copy the flagged high byte to the
* next row so that row repeats too.  On rows after the
* first, skip the divides and reuse the saved byte/bit
* values.
sta rastx0h+1,y ;4 propagate
lda ]repting ;3 first time through?
beq :firstre ;2 yup, finish calculations
lda ]rgtbyte ;3 need this in A
bpl :repeat ;3 always
:firstre lda rastx0h,y ;reload
sta ]repting ;any nonzero will do
and #$7f ;strip repeat flag
* Zero flag here reflects the left edge's high byte:
* zero means X0 < 256, so use the "lo" tables.
:noflag beq :lotabl
lda mod7hi,x
sta ]lftbit
lda div7hi,x
sta ]lftbyte
bpl :gotlft ;always
BREAK ;debug
:lotabl lda mod7lo,x
sta ]lftbit
lda div7lo,x
sta ]lftbyte
:gotlft
* divide right edge by 7
ldx rastx1l,y ;4 line num in Y
lda rastx1h,y ;4
beq :lotabr ;3
lda mod7hi,x
sta ]rgtbit
lda div7hi,x
sta ]rgtbyte
bpl :gotrgt ;always
BREAK ;debug
:lotabr lda mod7lo,x ;4
sta ]rgtbit ;3
lda div7lo,x ;4
sta ]rgtbyte ;3 = 25 for X1 < 256
:gotrgt
* A holds ]rgtbyte on every path that reaches here.
:repeat
cmp ]lftbyte ;3
bne :not1byte ;3
* The left and right edges are in the same byte. We
* need to set up the mask differently, so we deal with
* it as a special case.
ldy ]lftbit
lda leftmask,y ;create the AND mask
ldx ]rgtbit
and rightmask,x ;strip out bits on right
sta ]andmask
ldy ]lftbyte
lda colorline,y ;get color bits
eor (]hbasl),y ;combine w/screen
and ]andmask ;remove not-ours
eor (]hbasl),y ;combine again
sta (]hbasl),y
jmp rastlinedone
* This is the more general case. We special-case the
* left and right edges, then byte-stomp the middle.
* On entry, ]rgtbyte is in A
* X counts the bytes still to write: it starts at
* rgtbyte - lftbyte + 1 and is decremented for each
* partial edge byte handled here.
:not1byte
sec ;2 compute number of full
sbc ]lftbyte ;3 and partial bytes to
tax ;2 draw
inx ;2
ldy ]rgtbit ;3
cpy #6 ;2
beq :rgtnospcl ;3
lda rightmask,y ;handle partial-byte right
sta ]andmask
ldy ]rgtbyte
lda colorline,y
eor (]hbasl),y
and ]andmask
eor (]hbasl),y
sta (]hbasl),y
dex ;adjust count
:rgtnospcl
ldy ]lftbit ;3 check left for partial
beq :lftnospcl ;3
lda leftmask,y ;handle partial-byte left
sta ]andmask
ldy ]lftbyte
lda colorline,y
eor (]hbasl),y
and ]andmask
eor (]hbasl),y
sta (]hbasl),y
dex ;adjust count
beq rastlinedone ;bail if all done
iny ;advance start position
bne :liny ;always
BREAK
:lftnospcl
ldy ]lftbyte ;3
* Y = first full byte to write, X = number of full bytes.
:liny
do USE_FAST ;***** "fast" loop
* Instead of looping, jump into an unrolled loop.
* Cost is 10 cycles per byte with an extra 14 cycles
* of overhead, so we start to win at 4 bytes.
lda rastunidx,x ;4
sta :_rastun+1 ;4 patch low byte of the JMP target
lda colorline,y ;4 get odd/even color val
:_rastun jmp rast_unroll ;3
else ;***** "slow" loop
* Inner loop of the renderer. This runs 0-40x.
* Cost is 14 cycles/byte.
lda colorline,y ;get appropriate odd/even val
_wrhires sta $2000,y ;5 replaced with line addr
_xorcolor eor #$00 ;2 replaced with $00/$7f
iny ;2
dex ;2
bne _wrhires ;3
fin ;*****
rastlinedone
ldy ]cur_line ;3 more lines to go?
cpy rast_bottom ;4
bge :done ;2
iny ;2
jmp rastloop ;3 must have line in Y
:done rts
* fixrastxor only has a body in the USE_FAST build, which
* is also the only build in which FillRaster calls it.
fixrastxor
do USE_FAST ;*****
* Update the EOR statements in the unrolled rastfill code.
* Doing this with a loop takes ~600 cycles, doing it with
* unrolled stores takes 160. We only do this when we
* need to, so changing the color from green to blue won't
* cause this to run.
*
* Call with the XOR value in A.
*
* Each unrolled step is 5 bytes with the EOR operand at
* offset 3, hence the +3 and the stride of 5.
]offset = 0
lup BYTES_PER_ROW
sta rast_unroll+3+]offset
]offset = ]offset+5
--^
BEEP
rts
fin ;*****
* The line code is included directly after fixrastxor, so
* in the small build that label sits at the start of it.
* include the line functions
put FDRAW.LINE
* include the circle functions
put FDRAW.CIRCLE
lst on
CODE_END equ * ;end of code section
lst off
* include the data tables
put FDRAW.TABLES
lst on
DAT_END equ * ;end of data / BSS
lst off
* Save the appropriate object file.
do USE_FAST
sav FDRAW.FAST
else
sav FDRAW.SMALL
fin

339
FDRAW.TABLES.S Normal file
View File

@ -0,0 +1,339 @@
********************************
* *
* Fast Apple II Graphics *
* By Andy McFadden *
* Version 0.3, Aug 2015 *
* *
* Pre-computed data and *
* large internal buffers. *
* (Included by FDRAW.S) *
* *
* Developed with Merlin-16 *
* *
********************************
* Expected layout with alignment:
*
* P1 ylooklo, misc tables
* P2 ylookhi, colorline
* P3 rastx0l
* P4 rastx0h
* P5 rastx1l
* P6 rastx1h, div7hi, mod7hi
* P7 div7lo
* P8 mod7lo
* P9 rast_unroll, rastunidx
*
* Tables should be just under $900 bytes.
PG_ALIGN
* Hi-res Y lookup, low part (192 bytes).
ylooklo HEX 0000000000000000
HEX 8080808080808080
HEX 0000000000000000
HEX 8080808080808080
HEX 0000000000000000
HEX 8080808080808080
HEX 0000000000000000
HEX 8080808080808080
HEX 2828282828282828
HEX a8a8a8a8a8a8a8a8
HEX 2828282828282828
HEX a8a8a8a8a8a8a8a8
HEX 2828282828282828
HEX a8a8a8a8a8a8a8a8
HEX 2828282828282828
HEX a8a8a8a8a8a8a8a8
HEX 5050505050505050
HEX d0d0d0d0d0d0d0d0
HEX 5050505050505050
HEX d0d0d0d0d0d0d0d0
HEX 5050505050505050
HEX d0d0d0d0d0d0d0d0
HEX 5050505050505050
HEX d0d0d0d0d0d0d0d0
* Color masks for odd/even bytes, colors 0-7.
* Indexed by g_color.  SetColor seeds colorline from
* oddcolor; the fast Clear starts from evencolor.
evencolor dfb $00,$2a,$55,$7f,$80,$aa,$d5,$ff
oddcolor dfb $00,$55,$2a,$7f,$80,$d5,$aa,$ff
* XOR mask for colors 0-7 - non-BW flip on odd/even.
* XORing a color byte with its entry turns the even
* pattern into the odd one and back.
xormask dfb $00,$7f,$7f,$00,$00,$7f,$7f,$00
* AND mask for the 7 pixel positions, high bit set
* for the color shift.
* Indexed by (x mod 7).
andmask dfb $81,$82,$84,$88,$90,$a0,$c0
* These are pixel AND masks, used with the modulo 7
* result. Entry #2 in leftmask means we're touching
* the rightmost 5 pixels, and entry #2 in rightmask
* means we're touching the 3 leftmost pixels.
*
* The high bit is always set, because we want to
* keep the color's high bit.
leftmask dfb $ff,$fe,$fc,$f8,$f0,$e0,$c0
rightmask dfb $81,$83,$87,$8f,$9f,$bf,$ff
PG_ALIGN
* Hi-res Y lookup, high part (192 bytes).
* OR with $20 or $40.
ylookhi HEX 0004080c1014181c
HEX 0004080c1014181c
HEX 0105090d1115191d
HEX 0105090d1115191d
HEX 02060a0e12161a1e
HEX 02060a0e12161a1e
HEX 03070b0f13171b1f
HEX 03070b0f13171b1f
HEX 0004080c1014181c
HEX 0004080c1014181c
HEX 0105090d1115191d
HEX 0105090d1115191d
HEX 02060a0e12161a1e
HEX 02060a0e12161a1e
HEX 03070b0f13171b1f
HEX 03070b0f13171b1f
HEX 0004080c1014181c
HEX 0004080c1014181c
HEX 0105090d1115191d
HEX 0105090d1115191d
HEX 02060a0e12161a1e
HEX 02060a0e12161a1e
HEX 03070b0f13171b1f
HEX 03070b0f13171b1f
* Masks for current color (even/odd), e.g. 55 2a 55 2a ...
* Updated whenever the color changes.
* One entry per screen byte column; written by SetColor.
colorline ds 40
* Raster tables, one entry per screen row: left edge
* (rastx0l/h) and right edge (rastx1l/h) as 16-bit pixel
* columns split into low and high bytes.  Bit 7 of a
* rastx0h entry is FillRaster's "repeat" flag.
PG_ALIGN
rastx0l ds NUM_ROWS
PG_ALIGN
rastx0h ds NUM_ROWS
ds 1 ;repeat mode can overstep
PG_ALIGN
rastx1l ds NUM_ROWS
PG_ALIGN
rastx1h ds NUM_ROWS
* Lookup tables for dividing 0-279 by 7. The "hi"
* parts are 24 bytes each, so they fit inside
* the previous 192-byte entry. The "lo" parts
* each fill a page.
div7hi HEX 2424242525252525
HEX 2525262626262626
HEX 2627272727272727
mod7hi HEX 0405060001020304
HEX 0506000102030405
HEX 0600010203040506
PG_ALIGN
div7lo HEX 0000000000000001
HEX 0101010101010202
HEX 0202020202030303
HEX 0303030304040404
HEX 0404040505050505
HEX 0505060606060606
HEX 0607070707070707
HEX 0808080808080809
HEX 0909090909090a0a
HEX 0a0a0a0a0a0b0b0b
HEX 0b0b0b0b0c0c0c0c
HEX 0c0c0c0d0d0d0d0d
HEX 0d0d0e0e0e0e0e0e
HEX 0e0f0f0f0f0f0f0f
HEX 1010101010101011
HEX 1111111111111212
HEX 1212121212131313
HEX 1313131314141414
HEX 1414141515151515
HEX 1515161616161616
HEX 1617171717171717
HEX 1818181818181819
HEX 1919191919191a1a
HEX 1a1a1a1a1a1b1b1b
HEX 1b1b1b1b1c1c1c1c
HEX 1c1c1c1d1d1d1d1d
HEX 1d1d1e1e1e1e1e1e
HEX 1e1f1f1f1f1f1f1f
HEX 2020202020202021
HEX 2121212121212222
HEX 2222222222232323
HEX 2323232324242424
mod7lo HEX 0001020304050600
HEX 0102030405060001
HEX 0203040506000102
HEX 0304050600010203
HEX 0405060001020304
HEX 0506000102030405
HEX 0600010203040506
HEX 0001020304050600
HEX 0102030405060001
HEX 0203040506000102
HEX 0304050600010203
HEX 0405060001020304
HEX 0506000102030405
HEX 0600010203040506
HEX 0001020304050600
HEX 0102030405060001
HEX 0203040506000102
HEX 0304050600010203
HEX 0405060001020304
HEX 0506000102030405
HEX 0600010203040506
HEX 0001020304050600
HEX 0102030405060001
HEX 0203040506000102
HEX 0304050600010203
HEX 0405060001020304
HEX 0506000102030405
HEX 0600010203040506
HEX 0001020304050600
HEX 0102030405060001
HEX 0203040506000102
HEX 0304050600010203
* RastFill unrolled loop. At each step we store the current
* color value, XOR it to flip the bits if needed, and advance.
* The caller needs to set the appropriate initial value based
* on whether the address is odd or even.
*
* We can use a 3-cycle "EOR dp" or a 2-cycle "EOR imm". The
* former is one cycle slower, the latter requires us to
* self-mod 40 instructions when the color changes.
*
* This must be page-aligned so that we can take the value
* from the rastunidx table and self-mod a JMP without having
* to do a 16-bit add. We have just enough room for the
* unrolled loop (40*5+3) and x5 table (41) = 244 bytes, fits
* on a single page.
*
* Entered from FillRaster with A = color byte and Y = first
* byte offset; leaves through the JMP back to rastlinedone.
* The EOR operands are rewritten by fixrastxor.
do USE_FAST ;*****
ds \
]hbasl equ zptr0 ;must match FillRaster
rast_unroll equ *
lst off
lup BYTES_PER_ROW
sta (]hbasl),y ;6
eor #$00 ;2
iny ;2 10 cycles, 5 bytes
--^
jmp rastlinedone
* Index into rast_unroll. If we need to output N bytes,
* we want to jump to (rast_unroll + (40 - N) * 5) (where
* 5 is the number of bytes per iteration).
* Entry 0 is therefore 200, which is the offset of the
* final JMP, and entry 40 is 0.
rastunidx
]offset = BYTES_PER_ROW*5
lup BYTES_PER_ROW+1 ;0-40
dfb ]offset
]offset = ]offset-5
--^
fin ;*****
********************************
*
* Code used to generate tables above. If you want to
* decrease load size, use these functions to generate
* the data into empty memory, then discard the code.
* (Maybe use a negative DS and overlap with rastx0l?)
*
********************************
DO 0 ;*****
* Initialize Y-lookup table.  We just call the bascalc
* function for every row, last row first, and copy its
* two result bytes into ylooklo / ylookhi.
*
* The high bytes are stored with a $0000 base, which is
* what the shipped ylookhi table and the "ORA #page"
* lookups expect.  To build a $2000-based table for the
* table-rewriting variant of SetPage, insert
* "ora #$20" before the store to ylookhi.
*
* NOTE(review): zptr1 is not defined in the main source
* shown with this file; define it (or substitute zptr0)
* before enabling this code.
init_ylook
]hbasl   equ   zptr1
]hbash   equ   zptr1+1
         ldx   #NUM_ROWS  ;X = rows remaining
         ldy   #NUM_ROWS-1 ;Y = current row
]loop    tya
         jsr   bascalc    ;result in ]hbasl/]hbash
         lda   ]hbasl
         sta   ylooklo,y
         lda   ]hbash
         sta   ylookhi,y
         dey
         dex
         bne   ]loop
         rts
* Hi-res base address calculation. This is based on the
* HPOSN routine at $F411.
*
* Call with the line in A. The results are placed into
* zptr1. X and Y are not disturbed.
*
* The value is in the $0000-1fff range, so you must OR
* the desired hi-res page in.
*
* The stack is used to hold the line number temporarily;
* A is overwritten.
*
bascalc
pha ;save the line number
and #$c0 ;keep the top two bits of the line
sta ]hbasl
lsr
lsr
ora ]hbasl
sta ]hbasl
pla ;line number again
sta ]hbash
asl
asl
asl
rol ]hbash
asl
rol ]hbash
asl
ror ]hbasl ;carry from the last shift becomes bit 7 of the low byte
lda ]hbash
and #$1f ;confine the high byte to $00-$1f
sta ]hbash
rts
*
* Create divide-by-7 tables.
*
* The first loop fills div7lo/mod7lo for 0-255; the second
* continues with the same quotient and remainder to fill
* the 24 entries of div7hi/mod7hi for 256-279.
* ]val = running quotient, X = running remainder, Y = index.
*
mkdivtab
]val equ zloc0
ldy #0
sty ]val
ldx #0
]loop lda ]val
sta div7lo,y
txa
sta mod7lo,y
inx
iny
beq :lodone ;index wrapped: 256 entries written
cpx #7
bne ]loop
inc ]val
ldx #0
beq ]loop ;always
:lodone ;safe to ignore ]val update
]loop lda ]val
sta div7hi,y
txa
sta mod7hi,y
iny
cpy #280-256
beq :hidone
inx
cpx #7
bne ]loop
inc ]val
ldx #0
beq ]loop ;always
:hidone rts
FIN ;*****

View File

@ -1,2 +1,59 @@
# fdraw
Fast Apple II graphics
fdraw
=====
Fast graphics routines for the Apple II
By Andy McFadden
Version 0.3, August 2015
## Overview ##
The fdraw library provides fast rendering of points, lines, rectangles,
and circles, as well as high-speed screen clears, for Apple II hi-res
graphics. It can be used from Applesoft or 6502 assembly language.
Two disk images are available in the [fdraw-disks.zip](fdraw-disks) zip
archive. `fdrawdemo.do` is a 140K disk image with the demos that will
run on an Apple ][+ or later. `fdrawdev.po` is an 800K disk image with
the source code, demos, and a few extras.
A video of the demos running in the AppleWin emulator
[is available](https://www.youtube.com/watch?v=z2RFGVoaROE).
Learn more about how fdraw works in the
[library documentation](docs/manual.md).
Learn about the demos in the [demo documentation](docs/demos.md).
Learn more about what possessed me to write a graphics library for the
Apple II more than 20 years after the platform was discontinued in the
[fadden's brain documentation](docs/personal-notes.md).
The main bits of source code are accessible from git for easy viewing,
but the "official" home is on `fdrawdev.po`.
All code is copyright 2015 by Andy McFadden. All rights reserved. The
source code is available under the Apache 2 license (a very friendly
open-source license).
### Version History ###
##### v0.1 March 13, 2006
No source code, just a demo with fast filled circles and screen clears.
##### v0.2 March 20, 2006
Polished up the sources and published. This version implemented Clear,
FillRect, FillCircle, and FillRaster.
##### v0.3 August 21, 2015
Added DrawPoint, DrawLine, DrawRect, DrawCircle, and SetLineMode. Various
size and performance improvements.
Added Amperfdraw to make Applesoft BASIC programming easier.
Added several more demos and tests.
Added documentation.

167
docs/demos.md Normal file
View File

@ -0,0 +1,167 @@
fdraw Demo README
=================
The fdraw distribution comes with a handful of demonstration programs.
Most of them are written in Applesoft BASIC, and use the amperfdraw
interface. This is a somewhat poor way to demonstrate animation
performance, as Applesoft adds a tremendous amount of overhead, but it
is the only way to show what you *can* do with Applesoft.
The easiest way to run them is with the "DEMO" program, which scans the
DEMOS directory for BASIC programs and presents a list. You can also
just run them directly.
* INTRO : Sort of a "hello, world" for fdraw. Mix of single- and
double-buffered animation.
* CIRCULAR : Draws lots of circles.
* RECTSPLAT : Draws lots of rectangles.
* CUBIC : Draws a spinning wireframe 3D cube. (The 3D coordinates are
pre-computed -- fdraw doesn't do matrix transforms.)
* TUNNEL : Animates circles to simulate driving through a tunnel.
* LINEAR : Draws lots of lines. The wipes show speed differences for
horizontal and vertical special cases, while the circular spinner
shows HPLOT is not as fast as &HPLOT which is not as fast as &PLOT for
a set of lines at a variety of angles.
* LINE.DIFF : Draws several lines with the ROM routines and fdraw
side-by-side to illustrate the difference in line style.
* CLEARLY : Clears the screen 32 times, 4 sets in each of the 8 colors.
The first round is done with the Applesoft ROM routine ("CALL 62454"),
the second round uses the fdraw &CLEAR function.
* HRFAN : A simple line-art demo, using "xdraw" DrawLine with lines in
different colors. Not a great demo, as the Applesoft code driving it
is rather slow, but it looks pretty good if you bump up the emulation
speed or switch to IIgs "fast" mode. (This deserves a conversion to
assembly language.)
* BRIAN.THEME.ORI : The Brian's Theme demo from the DOS 3.3 System
Master. Unmodified except for integration with the demo menu
system, and with the bug on line 31112 fixed.
* BRIAN.THEME.NEW : The Brian's Theme demo with '&' placed in front of
the various draw calls. There isn't a huge difference in speed, as
there's a lot of overhead from Applesoft, but it's interesting to note
the change in the appearance of the lines.
* WIGGLE : Sample program that shows direct use of rasterization tables.
When the demos are launched from the menu, they will assume that fdraw
is already loaded and won't try to load it again. If you run the demo
program directly, it will try to load FDRAW.FAST and AMPERFDRAW from the
parent directory before doing any drawing.
## Extras ##
The EXTRAS directory has some additional software that isn't "officially"
part of fdraw, but may be of use.
NOTE: some of these assume fdraw and amperfdraw are already loaded, and
will hang if not. Run DEMO and hit <esc> before running these.
* ARRAY.EXAMPLE : The &PLOT example from the documentation.
* XDRAW.ANIM : A demonstration of line animation using "xdraw" mode and
a simple shape that is drawn twice by a single &PLOT call. One copy
is offset by 2 pixels, so each &PLOT call erases the previous copy and
draws a new copy 2 pixels to the right. The animation is shown twice,
once with "erase all, draw all", and once with the erase and draw calls
interleaved for every line.
* LINEFONT : Program for creating draw-array tables for text phrases. Used
to create data files for the "intro" demo. See the "LINEFONT Details"
section for more information.
* DAVIEWER : Views the contents of .DA files created by LINEFONT.
* BENCHCLEAR : Calls the "clear" function 256 times from a small
assembly-language program. Handy for benchmarks, but slightly silly
since it's relatively easy to calculate the exact cycle cost.
## LINEFONT Details ##
NOTE: this program is an unfinished rough cut ("pre alpha"), used for
preparing data for demos.
The program includes a font definition, routines for displaying
characters, and code for generating and exporting pre-rendered strings.
Character vertices are expressed as floating-point values. The baseline
is at zero, the peak ascent is at 1.0, the lowest descent is -1.0. The
leftmost pixel is at zero, the maximum value for the rightmost pixel is 1.0.
Characters don't have to fill out the entire cell -- proportionally-spaced
fonts are supported -- but they are expected to start at the left edge.
So a capital 'M' might look like this:
0.0,0.0 -> 0.0,1.0 -> 0.5,0.7 -> 1.0,1.0 -> 1.0,0.0
There is currently no "user interface", unless the "user" can program in
Applesoft BASIC. To generate strings, add a series of statements that set
variables and call 20000 to add rendered strings to the set. The relevant
variables are:
S$ - string to add
DW - desired width, in pixels, of a cell 1.0 units wide
DH - desired height, in pixels, of a cell 2.0 units high (ascent + descent)
IS% - inter-character spacing, in pixels
SW% - width of the space character (usually same as DW)
MO% - monospace flag; if nonzero, all chars are treated as 1.0 units wide
Remove the REM from the start of line 1010 to enable the character viewer.
At present only a couple of lower-case letters are defined.
#### LINEFONT Output ####
The LINEFONT program outputs a binary blob that can be passed to
the &PLOT array-draw function. The file structure is:
+0 byte - number of array sets in the list.
+1 2 bytes * N - table of offsets to individual array sets. One of
these per array set. The value is the offset from the start of the
file.
(2N+1) array set #1:
+0 byte - number of vertices (0-127)
+1 byte - number of index pairs (0-127)
+2 2 bytes * V - vertices (values are signed X/Y)
+X 2 bytes * I - index pairs (values are 0-127)
To display phrase #3, you would get the 16-bit value from the offset
table with PEEK(start + 1 + 3 * 2) + PEEK(start + 2 + 3 * 2) * 256.
You get the number of vertices from PEEK(start + offset), and the number
of index pairs from PEEK(start + offset + 1). Finally, call the array-draw
function with:
VA = start + offset + 2
IA = VA + num_vertices * 2
&PLOT va, ia, num_index_pairs
The 0,0 point in the blob is in the center of the phrase horizontally
(which allows a maximum width of 255 pixels), and at the font baseline
vertically (so most of the font will appear above the zero point, but
descenders will extend below).
#### Future Enhancements ####
Right now the font definition is embedded in the program. This takes up
a lot of space -- before too long the BASIC program is going to intrude
on the hi-res page -- and is unnecessarily restrictive. The font should be
defined by a separate program, and BSAVEd into a line-font file that
LINEFONT can load.
Generating strings should be menu-driven and interactive, rather than
requiring manual changes to the code to fiddle with sizes and spacing.
DAVIEWER should be folded into the generation program (though it's kind
of handy as a simple example of how to unpack and access content).

990
docs/manual.md Normal file
View File

@ -0,0 +1,990 @@
fdraw Library Documentation
===========================
Fast graphics primitives for the Apple II
By Andy McFadden
Version 0.3, August 2015
## Overview ##
The fdraw library provides fast rendering of points, lines, rectangles,
and circles, as well as high-speed screen clears, for Apple II hi-res
graphics. It can be used from Applesoft or assembly language.
The Applesoft ROM routines were designed to be as compact as possible,
and were unable to use self-modifying code techniques, so their speed is
less than what the Apple II is capable of. The fdraw routines pick a
different point in the speed/space trade-off continuum, providing fast
speeds at a reasonable size. Not everyone agrees on what "reasonable"
means, so the fdraw code can be built in two modes, one that favors
speed, one that reduces size.
**Contents:**
- [Applesoft BASIC Ampersand API](#amperapi)
- [Raw API](#rawapi)
- [Building the Code](#building)
- [Apple II Hi-res in a Nutshell](#nutshell)
- [Notes on the Drawing Functions](#notes)
- [General Notes](#additional-notes)
- [Enhancement Ideas](#ideas)
- [My Quest for Lines](#history)
<div id='amperapi'/>
## Applesoft BASIC Ampersand API (Amperfdraw) ##
The ampersand API acts as a bridge between Applesoft BASIC and fdraw.
It's more convenient and has less overhead than POKE and CALL, though
you are not prevented from using that approach if you prefer. It's
best to use one or the other though, not mix and match.
All arguments are checked for validity. An appropriate Applesoft
error is thrown if invalid syntax or arguments are discovered.
This is not intended to be compatible with, nor a replacement for, the
ampersand utilities in Beagle Graphics.
* &NEW - calls the fdraw Init function (which sets the color to 0 and
selects hi-res page 1). You must do this once, at the start of
your program, after fdraw has been loaded. This also resets internal
amperfdraw state, setting the "HPLOT TO" origin to (0,0) and the "AT"
point to (139,95).
* &HGR - does what HGR does, only faster. Equivalent to executing
`&HCOLOR=0:&SCRN(1):&CLEAR:&HCOLOR=[prevcolor]`, and then setting the
display softswitches to display hi-res page 1 in mixed mode. Also sets
$e6 (HPAG) for convenience in case you want to mix & match with ROM
routines.
* &HGR2 - like &HGR, but for page 2. Like HGR2, this turns off
mixed-text mode.
* &SCRN({1,2}) - sets the hi-res page that will be used for drawing. Does
not change which page is displayed. (Use the softswitches, or call
&INVERSE.)
* &INVERSE - flips the render page to the other page, and hits the
display softswitches to show the page that was just rendered. Intended
for double-buffered animation.
* &HCOLOR={0-7} - sets color, using the same numbering scheme as Applesoft.
Does not affect the color used by the ROM routines.
* &CLEAR - clears screen to current color.
* &HPLOT [TO] x,y [TO x,y ...] - draws a point or a line. Works the same as
Applesoft, e.g. "&HPLOT TO" starts from the end of the previously
drawn line, and you can chain multiple "TO x,y" in a single statement.
* &EXP {0,1} - set line mode. 0 is normal, 1 is "xdraw".
* &XDRAW left,top,right,bottom - draws outline rectangle.
* &DRAW left,top,right,bottom - draws filled rectangle.
* &COS cx,cy,r - draws outline circle.
* &SIN cx,cy,r - draws filled circle.
* &AT cx,cy - sets center offset for array-based rendering. Position must
be on the hi-res screen (0-279, 0-191).
* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy] - draws from the
specified byte-arrays. See the "Drawing Lines with Indexed Byte-Arrays"
section for the full explanation.
<div id='rawapi'/>
## Raw API ##
The code is assembled at $6000 by default. The program's length includes
all data tables and work areas, and no memory outside of the program,
zero page, and the current hi-res page is modified.
Input parameters and the function jump table are located near the start
of the program. The API description below describes the addresses in
relative terms.
Input parameters are not checked for validity. They must be in the range
specified by the API, or undefined (but probably bad) behavior will result.
The values will not be modified by fdraw functions.
All drawing operations use the current color.
* +0 Init - call this when the library is first loaded. It must be
called before any other functions are used. It initializes the
color to zero and the page to $20.
* +3 (major version number, currently 0)
* +4 (minor version number, currently 3)
* +5 Input parameter area:
* +5 arg - used for misc functions, e.g. SetColor and SetPage
* +6 x0l - low part of the X0 coordinate (0-279)
* +7 x0h - high part of X0
* +8 y0 - Y0 coordinate (0-191)
* +9 x1l - low part of X1 (0-279)
* +10 x1h - high part of X1
* +11 y1 - Y1 coordinate (0-191)
* +12 rad - circle radius (0-255)
* +13 (reserved)
* +16 SetColor - set the color used for drawing (0-7) to the value in "arg".
The numbering is the same as the Applesoft hi-res colors.
* +19 SetPage - set the hi-res page used for drawing to the value in "arg",
which must be $20 or $40. Does not change the page that is displayed.
(Because a bad value can cause memory corruption, this value *is*
checked, and bad values rejected.)
* +22 Clear - erase the current hi-res page to the current color.
* +25 DrawPoint - plot a single point at x0,y0.
* +28 DrawLine - draw a line from x0,y0 to x1,y1 (inclusive).
* +31 DrawRect - draw a rectangle with corners at x0,y0 and x1,y1 (inclusive).
x0,y0 is the top-left, x1,y1 is the bottom-right. The left and
right edges will be drawn two bits wide to ensure that the edges
are visible (drawn at x0+1, x1-1).
* +34 FillRect - draw a filled rectangle with corners at x0,y0 and x1,y1
(inclusive).
* +37 DrawCircle - draw a circle with center at x0,y0 and radius=rad.
* +40 FillCircle - draw a filled circle with center at x0,y0 and radius=rad.
* +43 SetLineMode - set the DrawLine mode to the value in "arg", which can
be 0 (normal) or 1 (xdraw).
* +46 (reserved)
* +49 FillRaster - draw an arbitrary shape from the rasterization tables.
For each line from top to bottom, the left and right edges will
be read from rastx1/rastx2 and a raster drawn in the current color.
* +52 (byte) topmost line to rasterize (0-191)
* +53 (byte) bottom-most line to rasterize (0-191), inclusive
* +54 (2 bytes) address of rastx1l table
* +56 (2 bytes) address of rastx1h table
* +58 (2 bytes) address of rastx2l table
* +60 (2 bytes) address of rastx2h table
The rasterization table addresses are read-only; changing them will have
no effect.
fdraw uses a fair number of zero page locations. The exact set can be
determined by looking at FDRAW.S. The locations were chosen to not
interfere with DOS, ProDOS, Applesoft, or the Monitor. They may
interfere with Integer BASIC, SWEET16, or your own application code.
Remapping them to different locations is straightforward: just change
the assignment of zptr/zloc values near the top of FDRAW.S to use
different addresses. fdraw does not expect any zero page value to be
preserved across calls, so you're welcome to use those locations in your
own code, but understand that fdraw functions will overwrite them.
<div id='nutshell'/>
## Apple II Hi-res in a Nutshell ##
This is a quick overview of the Apple II hi-res graphics architecture
for anyone not recently acquainted.
The Apple II hi-res graphics screen is a quirky beast. The typical
API treats it as 280x192 with 6 colors (black, white, green, purple,
orange, blue), though the reality is more complicated than that.
There are two hi-res screens, occupying 8K each, at $2000 and $4000.
You turn them on and flip between them by accessing softswitches in
memory-mapped I/O space.
Each byte determines the color of seven adjacent pixels, so it takes
(280 / 7) = 40 bytes to store each line. The lines are organized into
groups of three (120 bytes), which are interleaved across thirds of
the screen. To speed the computation used to find the start of a
line in memory, the group is padded out to 128 bytes; this means
((192 / 3) * 8) = 512 of the 8192 bytes are part of invisible
"screen holes". The interleaving is responsible for the characteristic
"venetian blind" effect when clearing the screen.
Now imagine 280 bits in a row. If two consecutive bits are on, you
get white. If they're both off, you get black. If they alternate
on and off, you get color. The color depends on the position of the bit;
for example, if even-numbered bits are on, you get purple, while
odd-numbered bits yield green. The high bit in each byte adjusts the
position of bits within that byte by half a pixel, changing purple and
green to blue and orange.
This arrangement has some curious consequences. If you have green and
purple next to each other, there will be a color glitch where they meet.
The reason is obvious if you look at the bit patterns when odd/even meet:
`...010101101010...` or `...101010010101...`. The first pattern has two
adjacent 1 bits (white), the latter two adjacent 0 bits (black). Things
get even weirder if the split occurs at a byte boundary and the high bit is
different, as the half-pixel shift can make the "glitch" pixel wider or
narrower by half a pixel.
The Applesoft ROM routines draw lines that are 1 bit wide. If you execute
a command like `HGR : HCOLOR=1 : HPLOT 0,0 to 0,10`, you won't see
anything happen. That's because HCOLOR=1 sets the color to green,
which means it only draws on odd pixels, but the HPLOT command we gave
drew a vertical line on even pixels. It set 11 bits to zero, but since
the screen was already zeroed out there was no apparent effect.
If you execute `HGR : HCOLOR=3 : HPLOT 1,0 to 1,10`, you would expect a
white line to appear. However, drawing in "white" just means that no
bit positions are excluded. So it drew a vertical column of pixels at
X=1, which appears as a green line.
If (without clearing the screen after the previous command) you execute
`HCOLOR=4 : HPLOT 5,0 to 5,10`, something curious happens: the green line
turns orange. HCOLOR=4 is black with the high-bit set. So we drew a
line of black in column 5 (which we won't see, because that part of the
screen is already black), and set the high bit in that byte. The same
byte holds columns 0 through 6, so drawing in column 5 also affected
column 1. We can put it back to green with "HCOLOR=0 : HPLOT 5,0 to 5,10".
It's important to keep the structure in mind while drawing to avoid
surprises.
Note that the Applesoft ROM routines treat 0,0 as the top-left corner,
with positive coordinates moving right and down, and lines are drawn
with inclusive end coordinates. This is different from many modern
systems. fdraw follows the Applesoft conventions to avoid confusion.
Handy table of graphics softswitches:
name | addr | decimal | purpose
------ | ----- | ------- | ------------------
TXTCLR | $c050 | -16304 | enable graphics
TXTSET | $c051 | -16303 | text-only
MIXCLR | $c052 | -16302 | disable mixed mode
MIXSET | $c053 | -16301 | enable mixed mode (4 lines of text)
LOWSCR | $c054 | -16300 | display page 1
HISCR | $c055 | -16299 | display page 2
LORES | $c056 | -16298 | show lo-res screen
HIRES | $c057 | -16297 | show hi-res screen
<div id='building'/>
## Building the Code ##
The main fdraw code is written for the Merlin assembler (specifically
Merlin-16 3.40, though other versions should work). It uses plain 6502
code, and is expected to run on an Apple ][+.
For convenience when editing the files on an Apple II, and to allow the
code to be compiled by Merlin-16 running under ProDOS 8, the code is
broken into four files. The main file, FDRAW.S, includes the other
three with PUT directives. FDRAW.S holds the API entry points and some
of the drawing code. FDRAW.LINE.S has the code for drawing points and
lines, while FDRAW.CIRCLE.S has the code for drawing circles.
FDRAW.TABLE.S holds the data tables, as well as empty space for work
areas. The empty space is included in the binary so you can determine
the full memory footprint by looking at the length of the file.
Near the top of FDRAW.S is a constant, `USE_FAST`, which may be set
to 0 or 1. If set to 0, some code optimizations are disabled,
reducing the size of the code and data areas. Further, the page
alignment on data tables is disabled, reducing the internal fragmentation
of the data area.
The USE_FAST setting also determines which file receives the assembler
output: FDRAW.FAST or FDRAW.SMALL. To generate both, it is necessary to
assemble the file, change the constant, and then assemble the file again.
Tests and demos are written in Applesoft BASIC, with a couple of
exceptions.
### Why So Big? ###
The fdraw code weighs in at a hefty 5KB (or 4KB for the "small" build).
That doesn't sound like much in the age of multi-gigabyte mobile phones,
but it's a sizeable fraction of the space available on an Apple ][+.
If you want to modify individual pixels quickly, you need two things:
a line base-address table, and a divide-by-7 table. Computing base
addresses and dividing by 7 aren't hugely expensive, but we're going
to be doing them often, so they need to be as fast as possible.
The line address table has 192 entries, one for each line, 2 bytes per
entry. The divide-by-7 table has 280 entries, one for each horizontal
pixel position, with one byte for the quotient and one for the remainder.
(The remainder can be expressed as a numeric value from 0 to 6, or as
a byte with a specific bit set.)
That's 944 bytes. For optimum performance, each table must fit on a
single page of memory. We can split the division table into two pieces,
one for 0-255 and one for 256-279, and put the smaller half on the same
page as the Y table, along with 16 bytes of padding. The final size is
256 + 256 + (192+24+24+pad) + 192 = 960. So you can write off 1K of
memory before you've written any code.
(There's a clever way to reduce the size of the y-lookup table to 24
entries, but it's slightly faster and much easier to use full tables.)
For the FillRaster function, fdraw needs to record the left and right
X coordinates on each line (2 bytes each), so that's 192 * 4 = 768 bytes.
Again, for optimum performance, each table needs to be on its own page,
so for USE_FAST=1 that expands to 1024 bytes.
Add to that another full page of unrolled rasterization code, and you've
got 2304 bytes of tables.
The rest is code, most of which was written with a flagrant disregard
for size. Many common code fragments are repeated inline, rather than
called as a subroutine, because a subroutine call (JSR+RTS) costs 12
cycles. Calling a common "plot a point" function from the line-drawing
code would increase the per-pixel cost by 15-20%.
<div id='notes'/>
## Notes on the Drawing Functions ##
### Screen Clear ###
The Clear function erases the current hi-res page to the current color.
It's several times faster than the version built into the ROM.
#### Performance ####
The fastest possible way to clear the screen to a specific color on a
6502 is to write to every visible location with an absolute store
instruction. Subtracting the screen holes, that's 7680 addresses *
4 cycles = 30720 cycles. The code to do that would be 23,040 bytes long,
making it impractical.
A slower but more memory-efficient approach has one store statement for
each line, and iterates through 40 times (280 / 7 = 40). Factoring in the
loop overhead, that comes out to 40 * (192 * 5 + 9) = 38760 cycles.
192 sets of store instructions fills 576 bytes, which is much better
than 23K, but still quite a lot.
We can reduce the size further by taking the lines 3 at a time, erasing
the first 120 bytes in each 128-byte group (the last 8 bytes are the
screen hole). We'd need to use 7680/120 = 64 store instructions, for a
total of 120 * (64 * 5 + 9) = 39480 cycles, with 192 bytes for the main
part of the erase loop. We're not quite 2% slower, but 384 bytes
smaller, which seems a fair trade-off. Because we're accessing memory
linearly we now have a "venetian blind" clear, which is something of an
Apple II trademark, but we can fix that by spending an additional 522
cycles to erase the screen in thirds (top/middle/bottom).
Any further changes that make the code smaller also increase the execution
time. When built with USE_FAST=0, the code will use a different loop
with 32 stores that write 248 bytes each, and takes 41416 cycles. It's
half the size, but nearly 2000 cycles slower, and overwrites half of the
screen holes.
At the extreme end of space over speed is the Applesoft ROM routine -- HGR
or "CALL 62454" -- which only needs about 30 bytes for its main loop, but
takes (8192*33)+(12*64)+17 = 271121 cycles for black or white, or
(8192*40)+(12*64)+17 = 328465 cycles for green/purple/blue/orange --
7-8x slower than our preferred implementation.
The screen clear is wired to a specific hi-res page, so the SetPage
function must rewrite the store instructions when the page changes (or
we need to keep two full copies of the function around). For an
application that is constantly doing flip-erase, the overhead must be
factored into the efficiency of the approach -- for example, rewriting
stores with indexed LDA/EOR/STA in a loop will take 20 cycles per iteration,
1280 cycles for the full set of 64. The "slow" clear has half the
number of store instructions, so takes half the time to fix up after
a page flip.
### Raster Fill ###
Drawing an outline of a rectangle or circle can be done efficiently by
drawing lines or plotting points. Drawing a filled shape is more
expensive if one point is plotted at a time, especially on the Apple II
where every byte affects 7 pixels.
For filled shapes, fdraw populates a rasterization table. The table has
192 entries, each of which holds the left and right edges of the shape
on that line. The code fills in the pixels one line at a time, using
a simple byte store for the middle parts, and bit masks at the edges.
External applications can use the raster renderer directly by filling
out the rasterization table and calling FillRaster.
While the FillRaster function itself will not modify the contents of the
raster tables, other fdraw calls will, sometimes unexpectedly. For
example, drawing a horizontal line is performed with a single-line
fill call. Filled rectangles might populate the table in the way you'd
expect, or might use some internal shortcut that only fills out one line
and sets a "repeat" flag. Don't make assumptions about what will be in
the table after a call to one of the drawing functions. You *can* count
on whatever you wrote there yourself to be unmodified after calls to
FillRaster, SetColor, or SetPage, so you can do page-flipping and
color-cycling without having to repopulate the tables.
#### Performance ####
The fill code needs about 100 cycles to set up each line when drawing
a rectangle, more if the line doesn't start and end on byte boundaries.
The inner loop costs 10 cycles per byte. To clear the screen with the
raster fill code, it would take (192 * (100 + 40 * 10)) = 96000 cycles,
or nearly 2.5x the time required for the dedicated clear code. Which is
about what you'd expect, as the screen erase needs 4 cycles per byte, and
has lower per-line overhead. (This can be improved significantly; see
the notes in the "enhancements" section.)
Non-rectangular shapes take slightly longer to set up, as the edges must
be recomputed for each line.
### Lines ###
The goal is to provide a replacement for Applesoft's HPLOT function
that is faster and more consistent in appearance. Lines are drawn using
Bresenham's run-length algorithm.
Internally, there are five separate functions. Horizontal and vertical
lines each get a special-case handler. There's another for mostly-vertical
lines, one for mostly-horizontal lines, and one for wide mostly-horizontal
lines (255 pixels or wider). The latter requires 16-bit math, and is
slightly slower.
The Applesoft routine isn't quite the same as the standard Bresenham
algorithm, because it doesn't move diagonally. Consider a line from
(0,0) to (50,10) -- gently sloping down and to the right. The standard
algorithm would plot exactly 51 pixels, one in each horizontal position.
The "pen" always moves one pixel right, but sometimes also moves down.
In Applesoft, the "pen" can move either right or down, but can't do
both at once. This results in lines that feel thin when near horizontal
or vertical, but become thicker as they approach 45 degrees. This
reduces performance, because Applesoft draws twice as many pixels for a
diagonal line as the basic algorithm. It can also be visually jarring
when animated, because lines get very thick when near diagonal.
Different applications have used different styles; for example:
- Stellar 7 and Elite for the Apple II use Bresenham-style lines. If
you look at near-diagonal lines on a color monitor you can see the
pixels alternating green and purple.
- A2-FS1 Flight Simulator appears to be using Bresenham lines but with
doubled bits, effectively treating the screen as having 140 pixels. This
gives solid white lines with a fairly consistent feel.
- GraFORTH doubles the bits, but treats the screen as 256 pixels wide
(not 280... it gives up 24 pixels to improve performance). White
lines are thick like Flight Simulator, but feel less jagged because
each step can move left or right by one bit rather than two.
The SetLineMode function lets you choose between "draw" and "xdraw". The
former draws color pixels, setting and clearing bits as needed, while
the latter inverts whatever is currently on the screen. This can have
some unusual effects. Drawing the same line twice erases the line.
Drawing a green line over a purple line gives you a white line. Drawing
with colors 5 and 6 can produce odd results, because the high bit inverts
every time you touch a byte -- which means the ends of a horizontal line
will be a different color if the byte holds an even number of affected
pixels. It's best to draw with colors 0-3 when in xdraw mode. Clearing
the background to color 4, rather than 0, will cause drawing in colors
0-3 to actually be 4-7.
#### Performance ####
Mostly-horizontal lines step horizontally each iteration, and sometimes
step vertically. Mostly-vertical lines step vertically each iteration,
and sometimes step horizontally. Each part of the operation has a cost,
so the fastest lines are the ones drawn primarily in a single direction.
Diagonal lines are the worst case for performance.
The current code requires just under 80 cycles per pixel for diagonal
movement, and about 56 for single-direction movement. There's another
150 cycles or so per line for the initial setup.
Vertical lines cost about 43 cycles per pixel. Horizontal lines are
handled as a trivial FillRaster call, which at peak performance can write
7 pixels in 10 cycles.
This is about as fast as you can get with the Bresenham run-length
algorithm and Applesoft-style color handling. It's possible to go faster
by switching to a different pixel style, or using a run-slice approach.
### Rectangles ###
Filled rectangles are currently implemented by putting the left and
right edges into the rasterization table, and calling FillRaster.
Outline rectangles could be drawn as four lines, but that doesn't look
very good in color unless you get the lines on the right columns. To
ensure that the edges are in the correct color, outline rectangles are
drawn as four separate items: a two-pixel-wide left edge, a two-pixel-wide
right edge, and horizontal lines at the top and bottom. FillRaster does
the actual work.
#### Performance ####
FillRaster is suboptimal for rectangles, because it works by rows rather
than by columns (see "Vertically-Challenged Rasterization" later in this
document). Rectangles could be drawn 2.5x faster with dedicated code,
but at a cost of hundreds of bytes of memory.
The advantage of using FillRaster is that we need it for filled circles,
so adding support for rectangles was nearly free. And it's still pretty
fast.
### Circles ###
Circles are computed with Bresenham's algorithm. The idea is to compute
one octant of the circle with this bit of magic:
void drawOutline(int cx, int cy, int rad) {
int x, y, d;
d = 1 - rad;
x = 0;
y = rad;
while (x <= y) {
plot(cx, cy, x, y);
if (d < 0) {
d = d + (x * 4) + 3;
} else {
d = d + ((x - y) * 4) + 5;
y--;
}
x++;
}
}
Then each X/Y coordinate is plotted eight times:
(cx+x, cy+y) (cx-x, cy+y) (cx+x, cy-y) (cx-x, cy-y)
(cx+y, cy+x) (cx-y, cy+x) (cx+y, cy-x) (cx-y, cy-x)
For an outline circle, we plot every point. For a filled circle, we add
each point to a rasterization table. Near the top and bottom of the
circle there will be multiple updates to the same line, with each update
replacing the previous one (which works, as we are moving "outward").
The center point of the circle must be on screen, but it's not necessary
for the entire circle to fit. Coordinates outside screen space are clipped.
#### Performance ####
The implementation of Bresenham's algorithm is straightforward, and is
about as fast as it's going to get. There are actually two versions of
the core computation. If the radius is less than 41, we can keep all of
the variables in 8 bits. For circles with radius 41 and larger, we need
to use 16 bits, slowing each step slightly.
There are also two versions of the octant plot. If the circle fits entirely
on-screen, we use a simple version. If it doesn't, we use a version that
clips values. For rasterization that means clamping X to the left or
right edge, and skipping updates that are off the screen in the Y dimension.
For an outline circle we simply don't plot any clipped points.
The rendering of filled circles is very fast, though there is a possibility
of optimizing the center-fill of large circles. Outline circles were
added by inserting JSR PLOT at key points, and could perhaps be faster.
### Drawing Lines with Indexed Byte-Arrays ###
The &PLOT command allows a BASIC program to execute a series of line-draw
commands with a single statement. Think of it like shape-table animation
with lines instead of plotted points.
Suppose you want to draw a rectangle with an X through the middle. We'll
make it 11 units wide and 21 units high. To draw that in the middle of
the screen, we'd set CX=139 and CY=95, then draw lines offset from that
by +/- 5 in X and +/- 10 in Y:
HPLOT CX-5,CY-10 TO CX-5,CY+10 : REM LEFT
HPLOT CX-5,CY-10 TO CX+5,CY-10 : REM TOP
HPLOT CX+5,CY-10 TO CX+5,CY+10 : REM RIGHT
HPLOT CX-5,CY+10 TO CX+5,CY+10 : REM BOTTOM
HPLOT CX-5,CY-10 to CX+5,CY+10 : REM SLASH
HPLOT CX+5,CY-10 to CX-5,CY+10 : REM BACKSLASH
Six lines, each of which needs four coordinates. We'd need 24 bytes
to store that in an integer array.
Suppose instead we identified the four vertices, and numbered them:
#0 CX-5,CY-10
#1 CX+5,CY-10
#2 CX-5,CY+10
#3 CX+5,CY+10
and then created a list of line segments using the vertex indices:
HPLOT #0 TO #2
HPLOT #0 to #1
HPLOT #1 TO #3
HPLOT #2 TO #3
HPLOT #0 TO #3
HPLOT #1 TO #2
This requires (4*2) + (6*2) = 20 bytes, for a small savings. The real
value in the approach is that it separates the description of the shape
from the placement of the points. For example, if you want to change
vertex #0 to (CX-7,CY-12), you don't have to make changes to three
separate HPLOT calls. (This is particularly useful when you have code
that scales and rotates the vertices.)
For the current release of fdraw, the only built-in transform is
translation. Using "&AT cx,cy", you can place the center point anywhere
on the screen. This allows you to animate movement of the shape by
simply calling &AT to change the position, and &PLOT to draw.
The &PLOT command takes three arguments: the address of a vertex array,
the address of an index array, and the number of line segments to draw.
These are referred to as "byte arrays" because they are arbitrary
locations in memory where you have BLOADed or POKEd your shape data, not
Applesoft arrays. The count can be from 0 to 127. You can optionally
add an AT to the end; if not present, the coordinates of the previous AT
are used. The initial value is the center of the screen (x=139 y=95).
The vertex array uses two signed bytes per vertex (-128 to 127), one for
the X coordinate and one for the Y coordinate.
The index array uses two bytes per line segment. Each byte is an index
into the vertex array, from 0 to 127.
Here's an Applesoft program that implements the above example. (The DATA
statements use negative numbers for clarity; if you replace the negative
values with 256+value, e.g. -5 becomes 251, then you can avoid the IF
statement and just poke the value directly.)
100 TEXT : NORMAL : HOME
200 & NEW : & HGR : VTAB 21
210 & HCOLOR= 3
500 REM ARRAY TEST
510 AD = 768: REM $300
520 READ D: IF D = 1000 THEN 560
530 IF D < 0 THEN D = 256 + D
540 POKE AD,D:AD = AD + 1: GOTO 520
560 & PLOT 768,776,6: & AT 50,50: & PLOT 768,776,6
570 POKE 768,256 - 10: POKE 769,256 - 20: & PLOT 768,776,6 AT 100,50
600 DATA -5,-10, 5,-10, -5,10, 5,10
610 DATA 0,2, 0,1, 1,3, 2,3, 0,3, 1,2, 1000
This draws the shape twice, once at the middle of the screen, once centered
at 50,50. It then adjusts the top-left coordinate, and draws the shape
centered at 100,50. Looking at the output, you can see that the top-left
corner of the third instance has moved, and all three lines from that
point have moved with it.
If a vertex ends up off-screen, lines that use that vertex are omitted
(not clipped). If you tried to draw the example shape at (0,0), nothing
would happen, because every line has at least one point that would be
off-screen -- only point #3 is still visible, and all of the lines that
use that point extend off screen.
You can specify a maximum of 128 vertices and 128 index pairs for a
single call. If none of the line segments share vertices, you'll need
two vertices per line, which means a cap of 64 lines.
#### Performance ####
There isn't a whole lot to it -- it just feeds the lines to DrawLine.
The key speed advantage is the removal of the Applesoft overhead.
<div id='ideas'/>
## Enhancement Ideas ##
Some ideas for future versions of fdraw.
### fdraw ###
Line clipping would make the array-draw function more useful for
animation projects. If we accepted signed 16-bit values as input to
the clip function, we could specify an AT point outside the screen bounds.
That could be extended to circles, which could have off-screen centers.
A "game line" function or line mode that restricts coordinates to 0-255
and ignores color might be worth an experiment.
Triangle rasterization is possible, but perhaps a bit silly.
We could handle ellipses, but they're more complicated than circles, and
are slower to compute -- you need a couple of multiplications during
setup, and the asymmetry means you have to compute a quadrant rather
than an octant. If the goal is fast animation rather than general-purpose
picture painting then there's little value in supporting ellipses.
Some of the inner loops are almost certainly paying an extra cycle to
cross a page boundary. That's not easy to fix without adding absurd
amounts of padding.
"USE_FAST" could be applied more aggressively to reduce the size.
Having "fast" vs. "small" builds was mostly an experiment to see how
much of a difference in size and speed we'd get by dropping some of
the more expensive operations. Another way to reduce size would be to
make the build modular, so you could (say) omit circle drawing or only
include line drawing. Some trade-offs would have to be made, e.g. if
you only wanted line drawing then it makes sense to disable (or replace)
the horizontal-line optimization that calls FillRaster, as that requires
some sizeable tables that would otherwise be unused.
### Amperfdraw ###
The Amperfdraw API is somewhat minimal and could be improved. Taking a
cue from Beagle Graphics, the rect and circle calls should probably look
more like:
&DRAW width,height [AT left,top]
&COS radius [AT left,top]
The "&AT" coordinate, currently only used by &PLOT, should be more
widely used. Not only is it more convenient, it's also slightly faster,
since we don't have to parse the left/top coordinates each time.
The existing code is (somewhat lazily) using the Applesoft routines to
parse coordinates, which includes the range check. We wouldn't be able
to use them for width/height, because we would need to take values in the
range (0-280, 0-192), where width/height of zero means "draw nothing".
I deliberately used Applesoft tokens, rather than arbitrary words, to
make commands simpler to parse. Some of them don't fit that well. COS
and SIN are circle-related, but it's not obvious which is outline and
which is filled. DRAW and XDRAW don't really sound like rectangle-draw
calls, and would be much more appropriate if used to set the line draw
mode. Spending a few bytes & cycles to get better names might be
worthwhile.
It's possible to store &PLOT arrays in actual BASIC integer arrays,
which might make them easier to code for. The fact that arrays are
DIM()ed once, cannot be resized, and cannot be discarded makes them
difficult to use for dynamic data.
Currently &PLOT takes a list of vertices and a list of line segments.
We could also support "continuous line" mode, where it just plays
connect-the-dots (saves space, doesn't really affect speed). Being
able to embed color changes could be handy.
&PLOT handles lines and vertices the way Applesoft does, with inclusive
coordinates. This results in overdraw when vertices are shared. This
is a (small) performance hit, and causes graphical glitches when connected
lines are drawn in "xdraw" mode.
<div id='additional-notes'/>
# Additional Notes #
Getting into the gory details here.
## Setting a pixel ##
Hi-res pixels are curious creatures.
Pixel color values are determined by adjacent bits. The various drawing
routines only set one bit at a time, so "drawing" in green (hcolor=1) will
cause bits to be set in odd columns, cleared in even columns. We don't
touch adjacent bits, so drawing purple (hcolor=2) in column 0 and green
in column 1 will produce a white line, while drawing them with the columns
reversed will produce a black line.
Making life more complicated is the use of the high bit in each byte, which
affects the color. If you draw a purple line in column 0, and a black1
line with hcolor=4 in column 6, the purple line turns blue, because the
black1 line sets the high bit.
To set a bit at an arbitrary X offset, we need to do the following:
(1) Determine which byte to change (xc / 7) and which bit (xc mod 7).
(2) Determine the color mask for that byte. For green, it's 0x2a
(00101010) in even columns, 0x55 (01010101) in odd columns.
(3) Set or clear the target bit and the high bit, leaving the others
intact.
One way to do this is illustrated below. Assume we're drawing a green
line at X=17. There's already a green dot at X=15, which gives us a
bit pattern of 00000010. (Bits are "backwards", i.e. the bit on the
right is the pixel on the left.)
LDY byteoffset Y=2
LDX bitoffset X=3
LDA bitmask,x A=0x88 (10001000)
STA <andmask
LDA oddevencolor,y 4 cyc A=0x2a (00101010)
EOR (hbasl),y 5 cyc A=0x28 (00101010 ^ 00000010 = 00101000)
AND <andmask 3 cyc A=0x08 (00101000 & 10001000 = 00001000)
EOR (hbasl),y 5 cyc A=0x0a (00001000 ^ 00000010 = 00001010)
STA (hbasl),y 6 cyc
As a second example, here's how we plot a black1 (hcolor=4) point at X=6
when there's a purple point (hcolor=2) at X=0 (00000001).
LDA bitmask,x A=0xc0 (11000000)
STA <andmask
LDA oddevencolor,y 4 cyc A=0x80 (10000000)
EOR (hbasl),y 5 cyc A=0x81 (10000000 ^ 00000001 = 10000001)
AND <andmask 3 cyc A=0x80 (10000001 & 11000000 = 10000000)
EOR (hbasl),y 5 cyc A=0x81 (10000000 ^ 00000001 = 10000001)
STA (hbasl),y 6 cyc
Note the purple pixel is still set, but now the high bit is as well,
changing it to blue.
The trick is to start with the color pattern, which specifies how we want
the bits to be set or cleared. We EOR in the screen, which causes the
bits in A to be inverted wherever they were set on the screen. Next we
use the AND mask to zero out the bits we don't want to update on-screen.
When we do the second EOR from the screen, the bits we just zeroed will
take on the values from the screen, while the bits we didn't zero will
return to their original values from the color pattern (because EORing
twice with the same value restores the original).
It might look a little nicer if we always set two adjacent bits. That
would avoid the phenomenon where drawing from 0,0 to 0,10 in green doesn't
appear to do anything. For 6 out of 7 pixels this is easy, a simple
adjustment to the bitmask, but for the 7th pixel we'll need to update an
adjacent byte... unless it's the rightmost byte, which would cause us to
overflow and wrap around (or write into a screen hole). GraFORTH
renders lines this way, avoiding the overflow issue by limiting the X
coordinate range to (0,255).
To implement "xdraw" mode, where instead of setting pixels we invert
the current value, we can just omit (or NOP out) the first EOR.
We could draw faster if we simply set the new bits, rather than setting
some and clearing others according to the color mask. This could result
in some odd behavior, e.g. drawing a horizontal green line over a
horizontal purple line would result in a white line. Given how strange
things are in general this might not be an issue.
For 3D games like Stellar 7 or Elite, which essentially draw thin
monochromatic lines, we can drop the color mask and just set the bit on
the screen. Plotting a pixel is then simply:
LDA (hbasl),y 5 cyc
ORA <bitmask 3 cyc
STA (hbasl),y 6 cyc
This cuts the cycle count from 23 to 14. It's also not necessary to
worry about the high bit, which can save a few cycles when shifting
the bitmask. Most games are also able to limit the "active" part of
the screen to fewer than 255 pixels, which eliminates some 16-bit math
during setup.
For "xdraw" mode, the "ORA <bitmask" becomes "EOR <bitmask".
## Single- or Double-Buffered Animation ##
Because the Apple II has two hi-res graphics pages, it's possible to
double-buffer the animation to reduce or eliminate flicker. The
application displays one page while erasing and redrawing the other.
In most cases it's faster to erase the entire screen with the Clear
function than it is to draw over with black. For example, consider four
diagonal lines in a diamond shape, 100 pixels on a side. Diagonal
lines are the most expensive, as each step requires advancing in
both vertical and horizontal directions. The current implementation
needs about 80 cycles per diagonal pixel, or 100 * 4 * 80 = 32,000 cycles
to draw four medium-length lines (ignoring the setup cost for each line).
If you assume that the average cost to draw a pixel is about 70 cycles,
you can draw 570 pixels in the time it takes to erase the full screen.
We can clear the entire screen in about 40,000 cycles. If the drawing
area is smaller, a custom clear routine could do it in even less.
(Imagine your drawing routines keep track of the highest and lowest
line that anything touches, and then just erase the "dirty" lines.) So
unless you're doing relatively light rendering, you'll get the best
performance by wiping all or part of the screen rather than drawing over the
previous contents.
The &INVERSE command is intended to make double-buffered animation
easier from BASIC. Use &HGR2 to switch to full-screen mode, then call
`&SCRN(1):&HCOLOR=0:&CLEAR` to select page 1 and clear it. Draw your
first frame, then call &INVERSE to display page 1 and select page 2
for drawing.
An alternative approach is exemplified by Elite. The game only uses
one hi-res page, but doesn't noticeably flicker (though distant objects
sort of "sparkle"). Suppose you're writing a similarly line-oriented
game, and your rendering cycle looks like this:
- Step 1: draw over previous content with black
- Step 2: draw new content with white
Your game will flicker badly without double-buffering, because there will
be a few display refresh periods where most of the lines have been erased.
Suppose instead you did this:
- For each line in the shape, erase the old line, then draw the line in
its new position
Now you might get some flickering on certain lines if the beam crosses
them while they're black, but the shape as a whole will be visible most
of the time. The trouble with this approach is that, if your shape is
moving across the screen, you'll be drawing black over some recent white
lines, causing some distracting artifacts.
The way to make this work is to use "xdraw" mode, where bits are toggled
rather than set or cleared. If you draw a new line across an old line that
will soon be erased, the crossing point is cleared. When the old line
is erased, the crossing point is set white again, so your new line
appears unbroken.
It should be noted that this works well for Elite because they use backface
elimination, so lines within a single shape don't cross. It's also
important to avoid re-drawing points at shared vertices, or your corners
will disappear unless there are an odd number of lines.
If there's very little on screen, this could be faster than a full clear.
Mostly it's of value if you need the 8KB occupied by the second hi-res
page for something other than output.
## Vertically-Challenged Rasterization ##
As noted earlier, we can clear the screen in about 40,000 cycles with
the Clear function, but drawing a screen-sized filled rectangle takes
about 96,000. Why the difference?
The FillRaster function handles one horizontal line at a time. For
each line it sets any pixels sticking out on the left and right edges,
and then it jumps into an unrolled byte-stomp function that blasts
its way through the middle at 10 cycles per byte. Compare this to the
Clear function, which only needs 5 cycles per byte.
The trick to improving the speed at which we draw filled rectangles
is to make it more like the Clear function, which operates on columns
rather than rows.
Suppose, for example, we figured out which bits we need to set on the
left edge, and then applied them to every row. Then we did the same
for the right edge. The set-up cost for each edge went from
(N cycles * Y rows) to (N cycles). Can we apply this to the middle
byte as well?
It turns out we can. The fundamental problem with setting bytes
horizontally is that we have to index off of a direct page register,
e.g. "STA (hbasl),y". The only ways around this either add too much
loop overhead, too much setup overhead, or require too much memory.
For any given line, we need to find the base address, and issue a
6-cycle indirect store, followed immediately by an increment of the Y
register. If we're drawing in color it's worse than that, because we
also have to exclusive-OR the color because the bit pattern flips for
odd/even columns.
We're much better off unrolling vertically. Suppose you have 192
"STA abs,y" instructions, one for each row, one after the other. You
no longer need the base address lookup, because it's baked into the
code, and since we're only touching one column we don't need to worry
about odd/even color values here. To use this to draw rows 50-100, you
would replace the STA in row 101 with an RTS, and then JSR to the 50th
STA instruction. After the column is painted, you increment Y, exclusive-OR
the color value, and jump through again. (You can make this a little
faster by JMPing in and out instead, but you pay a bit more for setup
and cleanup, especially when you have to restore the base address that
got overwritten by the JMP.)
With this change we're working at 5 cycles per byte, plus the loop
overhead. A full-screen FillRect will be about as fast as a Clear.
There are a couple of down sides. First, you need 192*3=576 bytes to
hold this pile of store instructions. If you're drawing a lot of filled
rectangles, though, the 2x speed improvement would make the size penalty
worthwhile. The other problem arises if you use double-buffered animation,
as the table is hard-wired to page 1. You can either spend a couple
thousand cycles when the page flips to rewrite the addresses, or you can
have a second full copy of the stores for page 2.
The current horizontally-focused implementation uses 256 bytes for its
unrolled code area, but you wouldn't be able to get rid of that by
switching to the vertical approach. The reason the code works the way
it does is that it's designed to render circles, and those are hard to do
vertically. With horizontal rasters, when you look at the left and right
edges you only need to examine the current row, and set pixels in a
single byte. With vertical strips, each byte spans seven columns of
pixels, so the top and bottom "edges" might be several bytes deep. The
code would have to iterate in "edge space" until it reached the meaty
center, and the cost of doing so would likely erase the benefit of vertical
fills until your circles got reasonably large.
It's possible that a hybrid approach, in which selected rectangles in the
center of a large circle are drawn with a fast vertical fill, could be
used, with slower code rendering the outer edges. The trick would be to
come up with an approach that doesn't leave gaps, minimizes overdraw, and
is sufficiently faster to make the effort worthwhile.

197
docs/personal-notes.md Normal file
View File

@ -0,0 +1,197 @@
My Quest for Lines
==================
As far back as I can remember, I always wanted to draw lines on the
hi-res screen.
This probably started when I saw Battlezone in the arcades in the early
1980s. I still think the game is beautiful -- a first-person shooter
reduced to the essential elements. I wanted to write something similar
for the Apple II, but I didn't know where to start. (I should probably
mention that I was 11 years old in 1980.)
Battlezone had a dedicated matrix processor (the "math box"), and a
vector display that handled the line drawing. The Apple II had neither
of those things, which meant that achieving the same level of performance
and graphical detail wasn't possible. Despite those shortcomings, Damon
Slye created a pretty solid Battlezone-ish game in 1983, called Stellar 7.
A couple of years later, Braben and Bell made another compelling wireframe
combat game, the space combat sim Elite. (The A2-FS1 flight simulator
came out much earlier, but the graphics were blinky, enemies were just
dots, and the action was much slower-paced. Of course, it loaded from
cassette tape and ran in 16KB, so they didn't have much choice.)
Seeing these games showed me that the problems could be solved. I decided
that the place to start was line drawing, because (a) line drawing is
pretty fundamental to wireframe 3D, and (b) I wasn't getting the performance
I needed out of HPLOT TO.
Somewhere in the mid-1980s -- I was in high school now -- I began by trying
to figure out how line drawing worked. Suppose, for example, you want to
HPLOT 0,0 TO 19,5. How do you decide which pixels to set?
I wrote a program (which I recently found) called "HPLOT SIMULATOR". It
computed the ratio of vertical to horizontal pixels (e.g. 6 / 20 = 0.3),
and marched horizontally across the screen, adding the fractional value to
the Y coordinate at each step. The result was a pretty good-looking line.
The trouble was that it used floating-point math and required division,
things that the 6502 is not very good at. It occurred to me that division
can be performed as a series of integer subtractions. (It probably occurred
to me because I didn't know any other way to divide on the 6502, not having
encountered the shift-and-subtract approach yet.) So if you initialize a
counter to zero, and add 6 to it each time you move horizontally, then when
it reaches 20 you know it's time to move vertically. Subtracting 20 from
the counter resets it, but retains the division remainder as the starting
point, so you retain the fractional part.
When I went to college I took a graphics class, and was introduced to
Bresenham's classic line algorithm. This was essentially the same as what
I'd figured out for myself, but with two refinements: (1) it used signed
values, allowing a slightly cheaper "< 0" comparison, and (2) it started
with the counter half full, correcting the slight lopsidedness of my lines.
The graphics class inspired me to write a 3D game library called Arc3D
in 1990. I used it to create a pair of demos: "Not Modulae", which
animated several 3D shapes on the screen, including a pair of ships from
Elite; and "Not Stellar 7", a graphics demo that let you drive around
(and, sadly, through) some tanks from Stellar 7. The Arc3D library was
written for the IIgs, in 65816 assembly, and used the super-hi-res screen.
Having a better CPU, lots more memory, and a less-quirky graphics
architecture made things easier than doing the same on a classic Apple II.
I wrote my own super-hi-res line drawing code, of course, but a year later
when I disassembled somebody else's demo I found better code. Which, it
turned out, they had also lifted from another source, an FTA demo. I
dropped mine and used theirs.
After I graduated from college, my side projects tended more toward data
compression and Netrek, so Arc3D was never improved upon.
Fifteen years later, in 2006, there was a discussion on a Usenet group
about circle rendering. Once upon a time I'd drawn circles from BASIC
with trig functions, but it was painfully slow, which made me wonder
about a part of the game Horizon V where you steer through a series of
circles. I wanted to try it for myself and see what it would take.
(Looking at a youtube video of Horizon V, the animation is more radial
than circular... I suspect it's not really drawing circles at all.)
I first announced my results in a
[comp.sys.apple2.programmer](https://groups.google.com/forum/#!msg/comp.sys.apple2.programmer/Vj_xVjMHaR0/cLU3t2TlPrMJ)
posting. I had focused on filled circles, rather than outline circles,
since that seemed like a more interesting challenge. The "fdraw" demo
supported fast rendering of filled circles, filled rectangles, and had
a very fast screen clear. A week later, after a bit of cleanup, I
[released the fdraw v0.2 sources](https://groups.google.com/d/msg/comp.sys.apple2.programmer/Un4pV5p8Elw/6qZVAPc_da0J).
It occurred to me at the time that this would be a handy place to stick
the hi-res line drawing code I'd always wanted to write. Somewhere around
this time I also sort of poked at the idea of writing a dedicated hi-res
graphics compression program.
Fast forward another nine years, to 2015. After learning about the LZ4
format, I went back to my data compression roots and wrote
[fhpack](https://github.com/fadden/fhpack) and some demos. I had so much
fun doing it that I decided it was finally time to write some hi-res
line drawing code.
Being older, wiser, and having easy access to relevant information, I
began with the appropriate chapters in Michael Abrash's _Graphics
Programming Black Book Special Edition_. This covered the standard
algorithm, but also had a chapter on a faster "run-slice" approach.
This intrigued me, because instead of the usual "step right, check if
it's time to move down, step right, check if it's time ..." logic, it
says, "figure out how long each line segment is; then, move right 3
times, step down, move right 4 times, step down, ...", saving a lot of
redundant computation. The trouble is that it requires fixed-point
division, and drawing N adjacent pixels is tricky when your graphics
architecture has 7 horizontal pixels per byte. You'd have to be a bit
crazy to try to get that to work.
So I went with a standard approach, and used the Applesoft ROM method of
coloring pixels (discussed in the fdraw docs). I carefully optimized
the code, and squeezed out as much performance as I could.
When I was done, I began looking around at what other people did to see if
there were any tricks I missed.
I looked at the Applesoft ROM code. Very clever, but very much optimized
for space over speed. Also, because it's in ROM, self-modifying code is
not possible, so they lose a cycle here and there.
Next I looked at GraFORTH. I figured out how functions were arranged,
identified the plot function, and disassembled it with CiderPress. It uses
a pretty standard algorithm, but supports multiple drawing modes and sets
two adjacent bits for better-looking colored lines. Good use of
self-modifying code, but some choices were made to reduce code size at the
expense of speed. My code was faster.
Next I looked at Elite. Digging through memory after the program had
loaded, I found a collection of purpose-built line functions. Some drew
color, most used EOR to "xdraw" monochrome lines. Standard Bresenham
approach, with a bit of variation on the Y-lookup table -- their table is
only 24 bytes (1/8th of the screen), and they use a quick "add 4 to the
high byte" 7 out of every eight lines. I tried applying this to my code,
but it turned out that just using a full lookup table was a tiny bit faster.
Next I looked at Stellar 7, one of my earliest inspirations. I scanned
through some files with CiderPress, looking for anything line-draw-esque.
(If you spend enough time drawing lines you start to see patterns.)
After about five minutes I found the code, in the same file as this
gigantic unrolled division routine. But as I started to dig into the code
I noticed that it was using a count oddly, and this one function was...
HOLY CATS he did run-slicing.
And he did it big. There are several line functions, all of them padded
out to live on a single page (so that none of the branches cross page
boundaries, which costs an extra cycle). It has the usual special cases --
simple horizontal and vertical lines -- and the usual split between
vertically-dominant and horizontally-dominant lines. But there are *three*
different functions for drawing mostly-horizontal lines, selected based on
slope, all of which try to set multiple horizontal pixels at once. The
slope of the line affects how the code is structured; for example, for
very shallow lines it expects that it will often be able to set an entire
byte at once. Color is not supported, so pixels are set with a simple
OR operation.
It's very impressive, and a wee bit terrifying. But when you're making
a game that will be spending much of its time drawing lines, you really
want to optimize those draw functions.
The tricky part is that divide. The division routine is unrolled to a
healthy 187 bytes long, and might take 240 cycles to run. For short
lines and mostly-vertical lines it might have been more efficient to skip
the division and just use a run-length implementation, but the ability to
set multiple bits at once for mostly-horizontal lines is a huge win. It's
a fair bet that the code in Stellar 7 is the fastest line drawing
implementation for the Apple II. (Of course, I haven't looked at Arcticfox,
the sequel...)
The general structure of the code was actually very similar to mine: always
draw left to right, use self-modifying code to handle up vs. down, and so on.
I didn't come away with any new ideas for optimizations to my run-length
implementation from this or the other programs I looked at... but there
are a lot of other games that I haven't disassembled.
So, 30+ years after HPLOT SIMULATOR, here I am with a bunch of code for
drawing lines on the Apple II hi-res screen.
I don't plan on writing Battlezone for the Apple II. Stellar 7 did that,
and more. My goal in developing fdraw was to scratch a very old itch.
I had forgotten how much fun this stuff is. Working in ARM assembly
language on Android offered similar challenges, but you're never entirely
sure exactly how your code will perform on the wide range of CPU
architectures (affecting instruction interleave, cache size and
replacement policy, etc.), you have to guess at cache misses and the
success rate of data prefetching, and it's difficult to measure results when
there's multiple threads running and interrupts firing. On the Apple II
you can count every cycle, and know exactly what will happen when.
I don't expect that anyone will find the code useful, but that wasn't
really the point.
Andy McFadden
August 2015

BIN
fdraw-disks.zip Normal file

Binary file not shown.