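* File: fdraw/FDRAW.S
* (Repository listing: 806 lines, 22 KiB.  The listing's "ArmAsm"
* language tag is a mis-detection; this is 6502 assembly source
* in Merlin-16 syntax.)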

********************************
* *
* Fast Apple II Graphics *
* By Andy McFadden *
* Version 0.3, Aug 2015 *
* *
* Main source file *
* *
* Developed with Merlin-16 *
* *
********************************
* Set to 1 to build FDRAW.FAST, set to zero to
* build FDRAW.SMALL.
* USE_FAST is tested by the DO blocks throughout this file
* (and by the PG_ALIGN macro) to choose between the
* speed-optimized and size-optimized code.
USE_FAST equ 1
* Set to 1 to turn on beeps/clicks for debugging.
* NOISE_ON gates the bodies of the CLICK, BEEP and BREAK
* macros; with it at zero they assemble to nothing.
NOISE_ON equ 0
lst off ;no assembly listing from here on
org $6000 ;code is assembled to run at $6000
*
* Macros.
*
* Addresses used by the debug macros.  $c030 is the Apple II
* speaker soft switch and $ff3a is the Monitor ROM bell routine.
spkr equ $c030
bell equ $ff3a
* Each macro body is wrapped in DO NOISE_ON / FIN, so it only
* emits code when NOISE_ON is non-zero.  "<<<" ends a macro.
*
* If enabled, click the speaker (changes flags only).
CLICK mac
do NOISE_ON
bit spkr
fin
<<<
* If enabled, beep the speaker (scrambles regs).
BEEP mac
do NOISE_ON
jsr bell
fin
<<<
* If enabled, insert a BRK.
* The $99 operand is the byte following the BRK opcode.
BREAK mac
do NOISE_ON
brk $99
fin
<<<
* In "fast" mode, we align tables on page boundaries so we
* don't take a 1-cycle hit when the indexing crosses a page.
* In "small" mode, we skip the alignment.
* ("ds \" is the Merlin idiom for "reserve space up to the
* next page boundary".)
PG_ALIGN mac
do USE_FAST
ds \
fin
<<<
*
* Hi-res screen constants.
*
BYTES_PER_ROW = 40 ;bytes per screen row (NUM_COLS / 7 pixels per byte)
NUM_ROWS = 192 ;pixel rows
NUM_COLS = 280 ;pixel columns
*
* Variable storage. We assign generic names to
* zero-page scratch locations, then assign variables
* with real names to these.
*
* 06-09 are unused (except by SWEET-16)
* 1a-1d are Applesoft hi-res scratch
* cc-cf are only used by INTBASIC
* eb-ef and ff appear totally unused by ROM routines
*
* zptr0 is a two-byte pointer ($1a/$1b); FillRaster uses it
* as the base address of the hi-res line being drawn.
* zloc0-zloc13 are single-byte scratch cells.  FillRaster
* aliases zloc0-zloc7; zloc8-zloc13 have no users in this
* file (presumably the included line/circle code uses
* them -- confirm there).
zptr0 equ $1a ;2b
zloc0 equ $06
zloc1 equ $07
zloc2 equ $08
zloc3 equ $09
zloc4 equ $1c
zloc5 equ $1d
zloc6 equ $cc
zloc7 equ $cd
zloc8 equ $ce
zloc9 equ $cf
zloc10 equ $eb
zloc11 equ $ec
zloc12 equ $ed
zloc13 equ $ee
********************************
*
* Entry points for external programs.
*
********************************
* Public interface block.  Nothing before this point emits
* bytes, so Entry sits at the ORG address and every item
* below has a fixed offset from it.  Offsets are shown as
* +$nn relative to Entry.  Do not insert, remove or reorder
* anything here without updating external callers.
Entry
jmp Init ;+$00 initialize data tables
dfb 0,3 ;+$03 version number
*
* Parameters passed from external programs.
* Callers store arguments here, then JSR to one of the
* jump-table slots below.
*
in_arg ds 1 ;+$05 generic argument
in_x0l ds 1 ;+$06 X coordinate 0, low part
in_x0h ds 1 ;+$07 X coordinate 0, high part
in_y0 ds 1 ;+$08 Y coordinate 0
in_x1l ds 1 ;+$09 X coordinate 1, low part
in_x1h ds 1 ;+$0a X coordinate 1, high part
in_y1 ds 1 ;+$0b Y coordinate 1
in_rad ds 1 ;+$0c radius for circles
ds 3 ;+$0d pad to 16 bytes
* Jump table: one three-byte JMP per routine.
jmp SetColor ;+$10
jmp SetPage ;+$13
jmp Clear ;+$16
jmp DrawPoint ;+$19
jmp DrawLine ;+$1c
jmp DrawRect ;+$1f
jmp FillRect ;+$22
jmp DrawCircle ;+$25
jmp FillCircle ;+$28
jmp SetLineMode ;+$2b
jmp noimpl ;+$2e reserved2
jmp FillRaster ;+$31
* Raster fill values. Top, bottom, and pointers to tables
* for the benefit of external callers.
rast_top ds 1 ;+$34 first line FillRaster draws
rast_bottom ds 1 ;+$35 last line FillRaster draws (inclusive)
da rastx0l ;+$36 address of left-edge X table, low bytes
da rastx0h ;+$38 address of left-edge X table, high bytes
da rastx1l ;+$3a address of right-edge X table, low bytes
da rastx1h ;+$3c address of right-edge X table, high bytes
noimpl rts ;+$3e target for reserved jump-table slots
********************************
*
* Global variables.
*
********************************
* g_inited is zero until Init stores a non-zero value into
* it; SetPage refuses to run while it is zero.
* g_color and g_page hold the values last accepted by
* SetColor and SetPage.
g_inited dfb 0 ;initialized?
g_color dfb 0 ;hi-res color (0-7)
g_page dfb $20 ;hi-res page ($20 or $40)
********************************
*
* Initialize.
*
********************************
Init
* One-time setup, reached through the "jmp Init" at Entry.
* Selects color 0, selects normal line mode, marks the
* library as initialized, and selects hi-res page 1.
*
* SetColor returns without touching the colorline table when
* the requested color equals g_color, and g_color assembles
* as 0.  Asking for color 0 would therefore skip the table
* build and leave colorline holding whatever was in memory
* at load time.  Store a value SetColor's compare can never
* match into g_color first so the table is always built.
* This costs about 500 extra cycles, once.
*
* In:  nothing
* Out: g_color = 0, g_inited non-zero, in_arg = $20;
*      A, X, Y and flags are not preserved
         lda   #$ff          ;differs from the 0 requested below
         sta   g_color
         lda   #$00
         sta   in_arg
         jsr   SetColor      ;set color to zero, build colorline
         jsr   SetLineMode   ;set normal lines (in_arg still 0)
         lda   #$20
         sta   in_arg
         sta   g_inited      ;any non-zero value satisfies SetPage
         jmp   SetPage       ;set hi-res page 1; its RTS returns for us
********************************
*
* Set the color.
*
********************************
SetColor
* Make the color in in_arg current and rebuild the
* per-column color table to match.  Returns at once if
* in_arg already equals g_color.
*
* In:  in_arg = color; only the low three bits are kept
* Out: g_color and colorline[0..BYTES_PER_ROW-1] updated
*      A, X, Y and flags are not preserved
         lda   in_arg
         cmp   g_color       ;unchanged? then nothing to do
         beq   :same
         and   #$07          ;keep it in 0-7
         sta   g_color
* colorline gives the fill code the color byte for any
* column with a single indexed load.  The alternative --
* one table per color plus patching every "LDA addr,y" --
* would avoid this rebuild but costs memory and is uglier.
*
* The table is written from the last column down, starting
* with the odd-column value; the EOR (its operand patched
* from xormask) converts the value for the next column.
* Total: 16 + (12 * 40) = 496 cycles.
         tax                 ;2  color indexes both tables
         lda   xormask,x     ;4
         sta   :_flip+1      ;4  patch EOR operand below
         lda   oddcolor,x    ;4  column 39 is odd
         ldy   #BYTES_PER_ROW-1 ;2
]fill    sta   colorline,y   ;5
:_flip   eor   #$00          ;2  operand replaced above
         dey                 ;2
         bpl   ]fill         ;3  until Y wraps below zero
:same    rts
********************************
*
* Set the page.
*
********************************
SetPage
* Choose which hi-res page the drawing code writes to.
*
* In:  in_arg = $20 (page 1) or $40 (page 2)
* Out: g_page updated and the page byte patched into the
*      drawing routines.  Any other in_arg rings the bell
*      and changes nothing.  If Init has not been called,
*      a message is printed and nothing is changed.
         lda   g_inited      ;cheap sanity check; this
         beq   noinit        ; routine is rarely called
         lda   in_arg
         cmp   #$20
         beq   :valid
         cmp   #$40
         bne   :reject
:valid   sta   g_page
         do    0             ;***** alternative, not assembled
* Alternative 1: rewrite the Y-lookup table itself so the
* lookups need no adjustment.  If the first entry already
* carries the requested page there is nothing to do.
* Flipping costs 2+(16*192)=3074 cycles up front, versus
* two cycles per lookup for an "ORA imm", so it only pays
* off after a great deal of drawing.
* (Note: assumes ylookhi is based at $2000 not $0000)
         cmp   ylookhi
         beq   :tabok
         ldy   #NUM_ROWS     ;2
]flip    lda   ylookhi-1,y   ;4
         eor   #$60          ;2  $20 <--> $40
         sta   ylookhi-1,y   ;5
         dey                 ;2
         bne   ]flip         ;3
         else                ;*****
* Alternative 2 (in use): patch the operand of each
* "ORA #page" in the drawing code.  Doing it here is a
* little untidy but keeps the cost out of every draw call.
*
* A second Y-lookup table, with the pointers switched here,
* would remove the ORA entirely without the rewrite cost
* above -- at the price of another 192 bytes.
         sta   _pg_or1+1     ;rastfill
         sta   _pg_or2+1     ;circle hplot
         sta   _pg_or3+1     ;circle hplot
         sta   _pg_or4+1     ;drawline
         sta   _pg_or5+1     ;drawline
         sta   _pg_or6+1     ;drawline
         sta   _pg_or7+1     ;drawline
         fin                 ;*****
:tabok   rts
:reject  jmp   bell          ;bad page value: beep, bell's RTS returns
noinit
* Print the "not initialized" text one character at a time
* through $fded, stopping at the zero byte that ends it.
         ldy   #$00
]next    lda   :initmsg,y
         beq   :exit         ;terminator reached
         jsr   $fded         ;cout
         iny
         bne   ]next
:exit    rts
:initmsg asc   "FDRAW NOT INITIALIZED",87,87,00
********************************
*
* Clear the screen to the current color.
*
********************************
Clear
* Fill the hi-res page selected by g_page with the color in
* g_color.  A, X and Y are not preserved.  Two
* implementations follow; USE_FAST decides which one is
* assembled.  Both keep the target page in the operands of
* their own STA instructions and rewrite those operands
* when g_page no longer matches.
do USE_FAST ;*****
* This performs a "visually linear" clear, erasing the screen
* from left to right and top to bottom. To reduce the amount
* of code required we erase in thirds (top/middle/bottom).
*
* Compare to a "venetian blind" clear, which is what you get
* if you erase memory linearly.
*
* The docs discuss different approaches. This version
* requires ((2 + 5*64 + 11) * 40 + 14) * 3 = 40002 cycles.
* If we didn't divide it into thirds to keep the top-down
* look, we'd need (5*64 + 9) * 120 = 39480 cycles, so
* we're spending 522 cycles to avoid the venetian look.
*
* :clrloop+2 is the high address byte of the first STA
* below; it reads $20 or $40 depending on which page the
* stores currently target.
lda :clrloop+2
cmp g_page
beq :pageok
* We're on the wrong hi-res page. Flip to the other one.
* 4 + (20*64) = 1284 cycles to do the flip (+ a few more
* because we're probably crossing a page boundary).
*
* The 64 STA instructions occupy 64*3 = 192 bytes, which
* happens to equal NUM_ROWS.  Y steps 192,189,...,3 so that
* :clrloop-3+2,y lands on the high address byte of each STA
* in turn, last to first.
BEEP
ldy #NUM_ROWS ;2
]loop lda :clrloop-3+2,y ;4
eor #$60 ;2
sta :clrloop-3+2,y ;5
dey ;2
dey ;2
dey ;2
bne ]loop ;3
* Set up the color.  The first column written (Y=0) is even,
* so start from evencolor.  Each :clearthird pass applies
* the EOR 40 times, leaving A back at its starting value,
* so A carries over unchanged into the next pass.
:pageok ldx g_color ;grab the current color
lda xormask,x
sta :_xormsk+1
lda evencolor,x
ldy #0
jsr :clearthird
ldy #BYTES_PER_ROW
jsr :clearthird
ldy #BYTES_PER_ROW*2
* fall through into :clearthird for final pass
*
* :clearthird -- on entry A = color byte for the first
* column, Y = byte offset of that column (0, 40 or 80).
* X counts the 40 columns; each trip through :clrloop
* writes one column of 64 rows, then advances Y.
:clearthird
ldx #BYTES_PER_ROW-1 ;2
:clrloop sta $2000,y ;5 (* 64)
sta $2400,y ;this could probably be
sta $2800,y ; done with LUP math
sta $2c00,y
sta $3000,y
sta $3400,y
sta $3800,y
sta $3c00,y
sta $2080,y
sta $2480,y
sta $2880,y
sta $2c80,y
sta $3080,y
sta $3480,y
sta $3880,y
sta $3c80,y
sta $2100,y
sta $2500,y
sta $2900,y
sta $2d00,y
sta $3100,y
sta $3500,y
sta $3900,y
sta $3d00,y
sta $2180,y
sta $2580,y
sta $2980,y
sta $2d80,y
sta $3180,y
sta $3580,y
sta $3980,y
sta $3d80,y
sta $2200,y
sta $2600,y
sta $2a00,y
sta $2e00,y
sta $3200,y
sta $3600,y
sta $3a00,y
sta $3e00,y
sta $2280,y
sta $2680,y
sta $2a80,y
sta $2e80,y
sta $3280,y
sta $3680,y
sta $3a80,y
sta $3e80,y
sta $2300,y
sta $2700,y
sta $2b00,y
sta $2f00,y
sta $3300,y
sta $3700,y
sta $3b00,y
sta $3f00,y
sta $2380,y
sta $2780,y
sta $2b80,y
sta $2f80,y
sta $3380,y
sta $3780,y
sta $3b80,y
sta $3f80,y
:_xormsk eor #$00 ;2 flip odd/even bits
iny ;2
dex ;2
* The loop body is 192+ bytes, too far for a relative
* branch back, hence the BMI-out / JMP-back pair.
bmi :done ;2
jmp :clrloop ;3
:done rts
else ;***** not USE_FAST
* This version was suggested by Marcus Heuser on
* comp.sys.apple2.programmer. It does a "venetian blind"
* clear, and takes (5 * 32 + 7) * 248 = 41416 cycles.
* It overwrites half of the screen holes.
*
* The first STA in :clrloop has base $1fff (high byte $1f),
* so the page test looks at the second STA instead; its
* high byte at :clrloop+5 is $20 or $40.
lda :clrloop+5
cmp g_page
beq :pageok
* We're on the wrong hi-res page. Flip to the other one.
* 12 + (20*31) = 632 cycles to do the flip. We have to
* single out the first entry because it's $1f not $20.
BEEP
lda :clrloop+2 ;4
eor #$20 ;2 $1f <-> $3f
sta :clrloop+2 ;4
* Y steps 93,90,...,3 so :clrloop+2,y visits the high
* address byte of STA number 31 down to STA number 1.
ldy #31*3 ;2
]loop lda :clrloop+2,y ;4
eor #$60 ;2 $20 <-> $40
sta :clrloop+2,y ;5
dey ;2
dey ;2
dey ;2
bne ]loop ;3
* Y runs 248 down to 1; with a base of $xxff that covers
* offsets $00-$f7 of each 256-byte page.  The first byte
* written is at offset $f7, hence oddcolor.
:pageok ldx g_color
lda xormask,x
sta :_xormsk+1
lda oddcolor,x
ldy #248 ;120 + 8 + 120
:clrloop
]addr = $1fff
lup 32 ;begin a loop in assembler
sta ]addr,y ;5
]addr = ]addr+$100 ;sta 20ff,21ff,...
--^
:_xormsk eor #$00 ;2
dey ;2
bne :clrloop ;3
rts
fin ;***** not USE_FAST
********************************
*
* Draw rectangle outline.
*
********************************
DrawRect
* Outline the rectangle with corners (in_x0,in_y0) and
* (in_x1,in_y1).  Presumably requires x0<=x1 and y0<=y1,
* as FillRect does -- confirm with callers.
*
* Four DrawLine calls would do the job (perhaps nudging the
* verticals a pixel to avoid overdraw), but a caller can
* already do that.  What this routine adds is an outline
* that always shows the correct color: every edge is drawn
* two pixels thick.  The left and right edges are drawn as
* two-pixel-wide filled strips rather than pairs of lines.
*
* Special cases, by shape:
*  - one pixel high:  just a line
*  - two pixels high: top and bottom lines only, no strips
*  - taller:          strips skip the top and bottom rows,
*                     which the horizontal lines cover
*  - one pixel wide:  just a line, so the strips don't
*                     double up
*
* Out: rast_top, rast_bottom and raster table entries are
*      overwritten; A, X, Y are not preserved.
         lda   in_y1
         sta   rast_bottom
         dec   rast_bottom   ;strips end one row above y1
         sec
         sbc   in_y0         ;A = y1 - y0
         beq   :asline       ;one row: plain line
         cmp   #1
         beq   :capsonly     ;two rows: no side strips
         ldy   in_y0
         iny                 ;strips begin one row below y0
         sty   rast_top
         lda   in_x0h        ;compare x0 with x1; if equal
         cmp   in_x1h        ; the +1/-1 strips would
         bne   :leftedge     ; overdraw
         lda   in_x0l
         cmp   in_x1l
         bne   :leftlo       ;A already holds in_x0l
:asline  jmp   DrawLine      ;degenerate: treat as a line
* Left strip: x0 .. x0+1, repeated for every strip row.
* Y holds the first strip row.
:leftedge lda  in_x0l
:leftlo  sta   rastx0l,y
         clc
         adc   #1
         sta   rastx1l,y
         lda   in_x0h        ;(carry from the ADC survives
         ora   #$80          ; LDA/ORA/STA/AND) "repeat" flag
         sta   rastx0h,y
         and   #$7f          ;drop the flag again
         adc   #0            ;high byte of x0+1
         sta   rastx1h,y
         jsr   FillRaster
* Right strip: x1-1 .. x1, same rows.
         ldy   rast_top
         lda   in_x1l
         sta   rastx1l,y
         sec
         sbc   #1
         sta   rastx0l,y
         lda   in_x1h
         sta   rastx1h,y
         sbc   #0            ;high byte of x1-1
         ora   #$80          ;"repeat" flag
         sta   rastx0h,y
         jsr   FillRaster
* Top and bottom rows, each a one-line fill from x0 to x1.
:capsonly
         ldy   in_y0
         jsr   :hline        ;top, then fall into bottom
         ldy   in_y1
:hline
         sty   rast_top      ;single row: top = bottom = Y
         sty   rast_bottom
         lda   in_x0l        ;left edge, no repeat flag
         sta   rastx0l,y
         lda   in_x0h
         sta   rastx0h,y
         lda   in_x1l        ;right edge
         sta   rastx1l,y
         lda   in_x1h
         sta   rastx1h,y
         jmp   FillRaster    ;its RTS returns for us
********************************
*
* Draw filled rectangle.
*
********************************
FillRect
* Fill the rectangle (in_x0,in_y0)-(in_x1,in_y1) with the
* current color.  Callers must pass y0=top, y1=bottom,
* x0=left, x1=right.
*
* Only the top row's raster entry is filled in; the
* "repeat" flag on its left edge tells FillRaster to reuse
* that span for every row down to rast_bottom.
*
* Out: rast_top, rast_bottom and the raster entry for row
*      in_y0 are overwritten; A, X, Y are not preserved.
         ldy   in_y0         ;row whose entry we populate
         lda   in_x1h        ;right edge
         sta   rastx1h,y
         lda   in_x1l
         sta   rastx1l,y
         lda   in_x0h        ;left edge, tagged with the
         ora   #$80          ; "repeat" flag
         sta   rastx0h,y
         lda   in_x0l
         sta   rastx0l,y
         sty   rast_top      ;rows y0 .. y1 inclusive
         lda   in_y1
         sta   rast_bottom
         jmp   FillRaster    ;its RTS returns for us
********************************
*
* Fill an area defined by the raster tables.
*
********************************
FillRaster
* Render rasterized output. The left and right edges
* are stored in the rastx0/rastx1 tables, and the top
* and bottom-most pixels are in rast_top/rast_bottom.
*
* This can be used to render an arbitrary convex
* polygon after it has been rasterized.
*
* If the high bit of the high byte of X0 is set, we
* go into "repeat" mode, where we just repeat the
* previous line. This saves about 40 cycles of
* overhead per line when drawing rectangles, plus
* what we would have to spend to populate multiple
* lines of the raster table. It only increases the
* general per-line cost by 3 cycles.
*
* We could use the "repeat" flag to use this code to
* draw vertical lines, though that's mostly of value
* to an external caller who knows ahead of time that
* the line is vertical. The DrawLine code is pretty
* good with vertical lines, and adding additional
* setup time to every vertical-dominant line to
* decide if it should call here seems like a
* losing proposition.
*
* In:  rast_top, rast_bottom = first and last line to draw
*      (both inclusive); rastx0l/h, rastx1l/h = left and
*      right X for each line; g_color.
* Out: screen memory updated.  A, X, Y and the zero-page
*      cells aliased below are overwritten.  A line with
*      the repeat flag also stores its rastx0h value into
*      the next line's rastx0h entry.
]hbasl equ zptr0
]hbash equ zptr0+1
]lftbyte equ zloc0
]lftbit equ zloc1
]rgtbyte equ zloc2
]rgtbit equ zloc3
]line equ zloc4 ;(not referenced in this routine)
]andmask equ zloc5
]cur_line equ zloc6
]repting equ zloc7
ldx g_color ;configure color XOR byte
lda xormask,x
do USE_FAST ;*****
* Fast build: the XOR value lives in the operands of the
* unrolled code.  Compare against the first operand and
* repatch all of them only when it differs.
cmp rast_unroll+3 ;already configured?
beq :goodmask
jsr fixrastxor
:goodmask
else
* Small build: one EOR in the inner loop; patch it directly.
sta _xorcolor+1
fin ;*****
lda #$00
sta ]repting ;zero = no repeat span started yet
ldy rast_top
* Main rasterization loop. Y holds the line number.
rastloop
sty ]cur_line ;3
ldx ylooklo,y ;4
stx ]hbasl ;3
lda ylookhi,y ;4
* The operand of this ORA is rewritten by SetPage.
_pg_or1 ora #$20 ;2 will be $20 or $40
sta ]hbash ;3 = 19 cycles
do USE_FAST-1 ;***** i.e. not USE_FAST
* Small build: also patch the line address into the STA
* of the inner loop.
stx _wrhires+1
sta _wrhires+2
fin ;*****
* divide left edge by 7
* (byte column goes to ]lftbyte, pixel-within-byte to
* ]lftbit, via lookup tables indexed by the low X byte)
ldx rastx0l,y ;4 line num in Y
lda rastx0h,y ;4
bpl :noflag ;2
* Repeat flag is set.  Copy it to the next line so that
* line repeats too.  On the first flagged line we still
* compute the edges; on later ones we reuse the values
* left in ]lftbyte/]lftbit/]rgtbyte/]rgtbit.
sta rastx0h+1,y ;4 propagate
lda ]repting ;3 first time through?
beq :firstre ;2 yup, finish calculations
lda ]rgtbyte ;3 need this in A
bpl :repeat ;3 always
:firstre lda rastx0h,y ;reload
sta ]repting ;any nonzero will do
and #$7f ;strip repeat flag
* Z now reflects the high byte of X0 (flag removed):
* zero selects the "lo" tables, non-zero the "hi" tables.
:noflag beq :lotabl
lda mod7hi,x
sta ]lftbit
lda div7hi,x
sta ]lftbyte
bpl :gotlft ;always
BREAK ;debug
:lotabl lda mod7lo,x
sta ]lftbit
lda div7lo,x
sta ]lftbyte
:gotlft
* divide right edge by 7
ldx rastx1l,y ;4 line num in Y
lda rastx1h,y ;4
beq :lotabr ;3
lda mod7hi,x
sta ]rgtbit
lda div7hi,x
sta ]rgtbyte
bpl :gotrgt ;always
BREAK ;debug
:lotabr lda mod7lo,x ;4
sta ]rgtbit ;3
lda div7lo,x ;4
sta ]rgtbyte ;3 = 25 for X1 < 256
:gotrgt
* Every path arrives here with ]rgtbyte in A.
:repeat
cmp ]lftbyte ;3
bne :not1byte ;3
* The left and right edges are in the same byte. We
* need to set up the mask differently, so we deal with
* it as a special case.
ldy ]lftbit
lda leftmask,y ;create the AND mask
ldx ]rgtbit
and rightmask,x ;strip out bits on right
sta ]andmask
* Masked write: ((color EOR screen) AND mask) EOR screen
* takes the color where the mask has 1 bits and keeps the
* existing screen bits elsewhere.
ldy ]lftbyte
lda colorline,y ;get color bits
eor (]hbasl),y ;combine w/screen
and ]andmask ;remove not-ours
eor (]hbasl),y ;combine again
sta (]hbasl),y
jmp rastlinedone
* This is the more general case. We special-case the
* left and right edges, then byte-stomp the middle.
* On entry, ]rgtbyte is in A
:not1byte
sec ;2 compute number of full
sbc ]lftbyte ;3 and partial bytes to
tax ;2 draw
inx ;2
* X = number of bytes spanned, edge bytes included.  Each
* partial edge handled below removes one from the count.
ldy ]rgtbit ;3
cpy #6 ;2
beq :rgtnospcl ;3 bit 6 = whole byte, no mask needed
lda rightmask,y ;handle partial-byte right
sta ]andmask
ldy ]rgtbyte
lda colorline,y
eor (]hbasl),y
and ]andmask
eor (]hbasl),y
sta (]hbasl),y
dex ;adjust count
:rgtnospcl
ldy ]lftbit ;3 check left for partial
beq :lftnospcl ;3 bit 0 = whole byte, no mask needed
lda leftmask,y ;handle partial-byte left
sta ]andmask
ldy ]lftbyte
lda colorline,y
eor (]hbasl),y
and ]andmask
eor (]hbasl),y
sta (]hbasl),y
dex ;adjust count
beq rastlinedone ;bail if all done
iny ;advance start position
bne :liny ;always
BREAK
:lftnospcl
ldy ]lftbyte ;3
* Here Y = first full byte column, X = number of full
* bytes to write (at least 1).
:liny
do USE_FAST ;***** "fast" loop
* Instead of looping, jump into an unrolled loop.
* Cost is 10 cycles per byte with an extra 14 cycles
* of overhead, so we start to win at 4 bytes.
* rastunidx maps the byte count to the low byte of the
* entry address, which is patched into the JMP below.
lda rastunidx,x ;4
sta :_rastun+1 ;4
lda colorline,y ;4 get odd/even color val
:_rastun jmp rast_unroll ;3
else ;***** "slow" loop
* Inner loop of the renderer. This runs 0-40x.
* Cost is 14 cycles/byte.
lda colorline,y ;get appropriate odd/even val
_wrhires sta $2000,y ;5 replaced with line addr
_xorcolor eor #$00 ;2 replaced with $00/$7f
iny ;2
dex ;2
bne _wrhires ;3
fin ;*****
* All paths for a line end up here.
rastlinedone
ldy ]cur_line ;3 more lines to go?
cpy rast_bottom ;4
bge :done ;2 (BGE = BCS) stop at or past the bottom
iny ;2
jmp rastloop ;3 must have line in Y
:done rts
fixrastxor
         do    USE_FAST      ;*****
* Store a new XOR value into every EOR operand of the
* unrolled raster-fill code.  FillRaster calls this only
* when the value actually changes, so switching between
* colors that share an XOR value (green to blue, say)
* never gets here.
*
* The stores are generated by the assembler, one per
* screen column, rather than looped at run time: about
* 160 cycles instead of roughly 600.  The operands sit
* 5 bytes apart, the first at rast_unroll+3.
*
* In:  A = XOR value to install
* Out: A unchanged by the stores (BEEP, when enabled,
*      scrambles registers)
]ofs     =     0
         lup   BYTES_PER_ROW
         sta   rast_unroll+3+]ofs
]ofs     =     ]ofs+5
         --^
         BEEP
         rts
         fin                 ;*****
* The remaining code and all tables live in separate source
* files that are pulled in here with PUT.
*
* include the line functions
put FDRAW.LINE
* include the circle functions
put FDRAW.CIRCLE
* The listing is switched on just around each marker
* equate (presumably so the section end addresses appear
* in the assembler output -- confirm).
lst on
CODE_END equ * ;end of code section
lst off
* include the data tables
put FDRAW.TABLES
lst on
DAT_END equ * ;end of data / BSS
lst off
* Save the appropriate object file.
* The name follows the USE_FAST switch at the top of the file.
do USE_FAST
sav FDRAW.FAST
else
sav FDRAW.SMALL
fin