diff --git a/AMPERFDRAW.S b/AMPERFDRAW.S new file mode 100644 index 0000000..5a1096c --- /dev/null +++ b/AMPERFDRAW.S @@ -0,0 +1,549 @@ +******************************** +* * +* Amper-fdraw * +* By Andy McFadden * +* For fdraw version 0.3 * +* * +* Applesoft ampersand * +* interface for fdraw. * +* * +* Developed with Merlin-16 * +* * +******************************** + + lst off + org $1d60 + +* All of the handler entry points can fit on a single +* page, so it's possible to save a few bytes by +* dropping the high jump table and just hardcoding +* the first page into the jump. This requires that +* the ORG be at $xx00. + + PUT FDRAW.DEFS + +* Applesoft BASIC tokens. +tok_plot equ $8d +tok_hgr2 equ $90 +tok_hgr equ $91 +tok_hcolor equ $92 +tok_hplot equ $93 +tok_draw equ $94 +tok_xdraw equ $95 +tok_inverse equ $9e +tok_clear equ $bd +tok_new equ $bf +tok_to equ $c1 +tok_at equ $c5 +*tok_sgn equ $d2 +tok_scrn equ $d7 +tok_exp equ $dd +tok_cos equ $de +tok_sin equ $df + +* System locations. +PCL equ $3a ;used by monitor +PCH equ $3b ;used by monitor +A1L equ $3c ;used by monitor +A1H equ $3d ;used by monitor +LINNUM equ $50 ;50-51 +FACLO equ $a1 +CHRGET equ $b1 ;advance ptr, get next tok +CHRGOT equ $b7 ;get next tok (no advance) +TXTPTR equ $b8 +HPAG equ $e6 ;$20 or $40 + +AMPERV equ $3f5 + +TXTCLR equ $c050 +TXTSET equ $c051 +MIXCLR equ $c052 +MIXSET equ $c053 +LOWSCR equ $c054 +HISCR equ $c055 +LORES equ $c056 +HIRES equ $c057 + +ERROR equ $d412 ;error based on X reg +FRMNUM equ $dd67 +SynError equ $dec9 ;throw SYNTAX ERROR +CHKCOM equ $debe +IllQError equ $e199 ;throw ILLEGAL QUANTITY ERROR +GETADR equ $e752 +GETBYT equ $e6f8 ;gets byte, in X/FACLO +HFNS equ $f6b9 ;get hi-res x/y for hplot + +* Prepare the ampersand vector. +* +* Ideally we'd check to see if the existing vector is +* different from ours, and if so, jump to it when we +* get a token we don't recognize. Not convinced +* there's an actual use case for this. 
+* init - install the ampersand hook.
+*
+* Writes "JMP dispatch" into the three bytes at AMPERV:
+* the JMP opcode, then the low and high bytes of the
+* handler address.
+init
+         lda   #$4c        ;JMP, in case it got
+         sta   AMPERV      ; trashed
+         lda   #<dispatch  ;low byte of handler
+         sta   AMPERV+1
+         lda   #>dispatch  ;high byte of handler
+         sta   AMPERV+2
+         rts
+
+* Entry point from BASIC.  The token is in A.
+*
+* Searches :cmdtab from the last entry to the first.
+* On a match, the handler address minus one is pushed
+* (high byte first, then low) and we jump to CHRGET;
+* the handler is then entered through the pushed
+* address.  Unknown tokens raise SYNTAX ERROR.
+dispatch
+         ldx   #:cmdend-:cmdtab-1
+]loop    cmp   :cmdtab,x
+         beq   :match
+         dex
+         bpl   ]loop
+         jmp   SynError
+
+:match
+         lda   :jmptabh,x
+*        lda   #>h_new     ;all on first page
+         pha
+         lda   :jmptabl,x
+         pha
+         jmp   CHRGET      ;eat token, jump
+
+
+* Tokens we handle.  :jmptabl and :jmptabh must list
+* the handlers in exactly this order.
+:cmdtab  dfb   tok_new
+         dfb   tok_hgr
+         dfb   tok_hgr2
+         dfb   tok_scrn
+         dfb   tok_hcolor
+         dfb   tok_inverse
+         dfb   tok_clear
+         dfb   tok_hplot
+         dfb   tok_xdraw
+         dfb   tok_draw
+         dfb   tok_exp
+         dfb   tok_cos
+         dfb   tok_sin
+         dfb   tok_at
+         dfb   tok_plot
+:cmdend
+
+* Low bytes of (handler address - 1).
+:jmptabl dfb   <h_new-1
+         dfb   <h_hgr-1
+         dfb   <h_hgr2-1
+         dfb   <h_scrn-1
+         dfb   <h_hcolor-1
+         dfb   <h_inverse-1
+         dfb   <h_clear-1
+         dfb   <h_hplot-1
+         dfb   <h_xdraw-1
+         dfb   <h_draw-1
+         dfb   <h_exp-1
+         dfb   <h_cos-1
+         dfb   <h_sin-1
+         dfb   <h_at-1
+         dfb   <h_plot-1
+
+* High bytes of (handler address - 1).
+:jmptabh dfb   >h_new-1
+         dfb   >h_hgr-1
+         dfb   >h_hgr2-1
+         dfb   >h_scrn-1
+         dfb   >h_hcolor-1
+         dfb   >h_inverse-1
+         dfb   >h_clear-1
+         dfb   >h_hplot-1
+         dfb   >h_xdraw-1
+         dfb   >h_draw-1
+         dfb   >h_exp-1
+         dfb   >h_cos-1
+         dfb   >h_sin-1
+         dfb   >h_at-1
+         dfb   >h_plot-1
+
+
+********************************
+* &NEW - initialize
+*
+* Resets our globals (page $20, color 0, previous
+* HPLOT coordinate 0,0, array center 139,95) and then
+* hands off to f_Init.
+h_new
+         lda   #$20        ;match Init result
+         sta   g_cur_page
+         lda   #$00
+         sta   g_hcolor
+         tax               ;init "previous hplot"
+         tay               ; coord to zero
+         jsr   storeprv
+         ldx   #139        ;279/2
+         ldy   #0
+         lda   #95         ;191/2
+         jsr   storeac
+         jmp   f_Init
+
+********************************
+* &HGR - show page 1 with mixed text, and clear screen.
+* The screen is cleared to color zero; hgr_com then
+* restores the current color from g_hcolor.
+h_hgr
+         ldx   #$20        ;page 1
+         lda   #$00        ;$c054
+         beq   hgr_com     ;always (A is zero)
+
+********************************
+* &HGR2 - show page 2 with no text, and clear screen.
+* The screen is cleared to color zero; hgr_com then
+* restores the current color from g_hcolor.
+h_hgr2
+         ldx   #$40        ;page 2
+         lda   #$01        ;$c055
+                           ;fall through to hgr_com
+
+* We go slightly out of our way to clear the screen
+* before tripping the softswitches.  This avoids
+* flashing the previous hi-res page contents when
+* entering from text mode.
+*
+* We also want to go nomix-page2 but page1-mix
+* (note reverse order) to avoid flashing text pg 2.
+* hgr_com - shared tail of &HGR / &HGR2.
+*
+* On entry: X = page value ($20 or $40), A = 0 for
+* page 1 or 1 for page 2.  Selects the page, clears it
+* with color 0, restores the color saved in g_hcolor,
+* and only then touches the display soft switches.
+hgr_com  stx   f_in_arg
+         stx   g_cur_page
+         stx   HPAG        ;probably useful
+         pha               ;save page selector for later
+         jsr   f_SetPage
+         lda   #$00        ;clear with color zero
+         sta   f_in_arg
+         jsr   f_SetColor
+         jsr   f_Clear
+         lda   g_hcolor    ;restore color
+         sta   f_in_arg
+         jsr   f_SetColor
+         bit   TXTCLR      ;$c050
+         bit   HIRES       ;$c057
+         pla               ;page selector; Z set for page 1
+         beq   :pg1
+         bit   MIXCLR      ;$c052
+         bit   HISCR       ;$c055
+         rts
+:pg1     bit   LOWSCR      ;$c054
+         bit   MIXSET      ;$c053
+         rts
+
+********************************
+* &SCRN({1,2}) - set the current hi-res page
+*
+* Any value other than 1 or 2 raises ILLEGAL QUANTITY.
+* The page number is multiplied by 32 to form the
+* $20/$40 value kept in g_cur_page.
+h_scrn
+         jsr   GETBYT
+         cpx   #1
+         beq   :okay
+         cpx   #2
+         beq   :okay
+         jmp   IllQError
+:okay    jsr   CHRGET      ;eat ')' (we assume)
+         txa               ;X/Y unaltered
+         asl
+         asl
+         asl
+         asl
+         asl               ;multiply x32
+         sta   g_cur_page
+         sta   f_in_arg
+         jmp   f_SetPage
+
+********************************
+* &HCOLOR={0-7} - set the current color
+*
+* Values of 8 or more raise ILLEGAL QUANTITY.  The
+* color is remembered in g_hcolor as well as being
+* passed to f_SetColor.
+h_hcolor
+         jsr   GETBYT      ;get color
+         cpx   #8
+         blt   :okay
+         jmp   IllQError
+:okay    stx   f_in_arg
+         stx   g_hcolor
+         jmp   f_SetColor
+
+********************************
+* &INVERSE - flip pages
+*
+* If we're currently drawing on $20, we set the page
+* to $40 and hit $c054 to show $20.  And vice-versa.
+* The goal is to make double-buffered animation easy.
+h_inverse
+         lda   g_cur_page
+         eor   #$60        ;swaps $20 <-> $40
+         sta   g_cur_page
+         ldx   #$00
+         cmp   #$40        ;about to start drawing on 2?
+         beq   :showpg1    ;yes, show page 1
+         inx               ;no, show page 2
+:showpg1 ldy   LOWSCR,x    ;hit $c054 (X=0) or $c055 (X=1)
+         sta   f_in_arg    ;A still holds the new draw page
+         jmp   f_SetPage
+
+********************************
+* &CLEAR - clear current page to current color.
+h_clear + jmp f_Clear ;well, that was easy + +******************************** +* &XDRAW left,top,right,bottom - draw rectangle outline +h_xdraw + jsr getltrb + jmp f_DrawRect + +******************************** +* &DRAW left,top,right,bottom - draw filled rectangle +h_draw + jsr getltrb + jmp f_FillRect + +******************************** +* &EXP {0,1} - set line draw mode +h_exp + jsr GETBYT + cpx #2 + blt :okay + jmp IllQError +:okay stx f_in_arg + jmp f_SetLineMode + +******************************** +* &COS cx,cy,rad - draw outline circle +h_cos + jsr getcxcyr + jmp f_DrawCircle + +******************************** +* &SIN cx,cy,rad - draw filled circle +h_sin + jsr getcxcyr + jmp f_FillCircle + +******************************** +* &AT x,y - select center for array draw +h_at + jsr HFNS + jmp storeac + +******************************** +* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy] +* draw lines from arrays of vertices and indices +h_plot jmp array_draw + +******************************** +* &HPLOT x,y - draw a point +* &HPLOT TO x,y - draw a line from last point to x,y +* &HPLOT x0,y0 to x1,y1 - draw a line + lst on ;last token handler -- +h_hplot equ * ; must be on first page + lst off ; to omit high byte table + + jsr CHRGOT ;check next token + lst off + cmp #tok_to ;is this an "HPLOT TO"? + beq :leadingto + jsr getx1y1 ;get the first coord + jsr copy1to0 + jsr CHRGOT ;see if single point + cmp #tok_to + beq :hplot_to ;nope, draw line + jsr copy0toprev ;draw point, and save x/y + jmp f_DrawPoint ; for subsequent HPLOT TO + +:leadingto ;"HPLOT TO", restore the + lda g_prevxl ; previous coord to x0/y0 + sta f_in_x0l ;(can't rely on f_in_zzz + lda g_prevxh ; being there -- we might + sta f_in_x0h ; have drawn a rect) + lda g_prevy + sta f_in_y0 +:hplot_to + jsr CHRGET ;eat the TO + jsr getx1y1 ;get the coords + jsr f_DrawLine ;draw it + jsr copy1to0 ;shift 1->0 for next round + jsr CHRGOT + cmp #tok_to ;another TO? 
+ beq :hplot_to ;yes, branch + jmp copy0toprev ;no, save prev and bail + +* Get coordinates and store in X1/Y1. +getx1y1 + jsr HFNS +store1 stx f_in_x1l ;store X/Y/A in coord1 + sty f_in_x1h + sta f_in_y1 + rts + +* Save x0/y0 as our "previous" coordinate. +copy0toprev + ldx f_in_x0l + ldy f_in_x0h + lda f_in_y0 +storeprv stx g_prevxl ;store X/Y/A in g_prev + sty g_prevxh + sta g_prevy + rts + +* Copy X1/Y1 into X0/Y0. +copy1to0 + ldx f_in_x1l + ldy f_in_x1h + lda f_in_y1 +store0 stx f_in_x0l ;store X/Y/A in coord 0 + sty f_in_x0h + sta f_in_y0 + rts + +* Store X/Y/A into array-center. +storeac stx g_ac_xl + sty g_ac_xh + sta g_ac_y + rts + +* Get left/top/right/bottom coordinates. +getltrb + jsr HFNS + jsr store0 ;save as X0/Y0 + jsr CHKCOM ;eat a comma + jsr HFNS + jsr store1 ;save as X1/Y1 + rts + +* Get center coordinates and radius. +getcxcyr + jsr HFNS ;get CX and CY + jsr store0 ;save as X0/Y0 + jsr CHKCOM ;eat a comma + jsr GETBYT ;convert to 0-255 + stx f_in_rad + rts + +* Array-draw handler. +* +* We know that fdraw doesn't use LINNUM or A1L/A1H, +* so it's safe to use them here. +array_draw +]vertices equ A1L ;2b +]indices equ LINNUM ;2b +]count equ PCL +]cur equ PCH + + jsr FRMNUM ;get vertex buffer address + jsr GETADR + lda LINNUM ;copy to A1L + sta ]vertices + lda LINNUM+1 + sta ]vertices+1 + jsr CHKCOM ;eat the comma + jsr FRMNUM ;get index buffer address + jsr GETADR ;leave it in LINNUM + jsr CHKCOM + jsr GETBYT ;get the count + cpx #128 ;range check (0-127) + blt :countok + jmp IllQError +:countok txa + beq :done ;nothing to do + asl ;double it + sta ]count ;stash it + lda #$00 + sta ]cur + +* Check for optional AT cx,cy. 
+ jsr CHRGOT + cmp #tok_at + bne :noat + JSR CHRGET ;eat the AT + lda LINNUM ;the code that reads the + pha ; hi-res coordinates will + lda LINNUM+1 ; overwrite LINNUM, so + pha ; we have to save & restore + jsr h_at + pla + sta LINNUM+1 + pla + sta LINNUM +:noat + +]loop jsr getvertex + bcs :skip2 + jsr store0 + jsr getvertex + bcs :skip + jsr store1 + jsr f_DrawLine + dfb $2c ;BIT addr +:skip2 inc ]cur +:skip lda ]cur + cmp ]count + blt ]loop +:done rts + +* Get the Nth vertex, specified by ]cur, and load it +* into X/Y/A (xlo/xhi/y). Returns with carry set if +* the vertex is invalid. +* +* Increments ]cur by 1. +getvertex + ldy ]cur + inc ]cur + lda (]indices),y + bmi :badv ;must be 0-127 + jsr :calcvertex + + ldx g_out_x + ldy g_out_x+1 + beq :xok ;0-255, ok + cpy #1 + bne :badv ;512+ + cpx #280-256 + bge :badv ;280-511 +:xok + lda g_out_y+1 + bne :badv ;Y is neg or > 255 + lda g_out_y + cmp #192 + bcc :goodv +:badv + sec +:goodv rts + +* Get VX and VY, merging with AC, and store in +* 16-bit g_out_x and g_out_y. Range not checked +* here. On entry, A has vertex index. +:calcvertex + asl + tay + ldx #$00 ;hi byte of vertex + lda (]vertices),y ;x-coord + bpl :xpos + dex ;sign-extend hi byte +:xpos clc + adc g_ac_xl + sta g_out_x + txa + adc g_ac_xh + sta g_out_x+1 + + iny + ldx #$00 + lda (]vertices),y ;y-coord + bpl :ypos + dex ;sign-extend hi byte +:ypos clc + adc g_ac_y + sta g_out_y + bcc :nocarry + inx +:nocarry stx g_out_y+1 + rts + + + +******************************** +* Global variables + +g_cur_page ds 1 ;$20 or $40 +g_hcolor ds 1 +g_prevxl ds 1 +g_prevxh ds 1 +g_prevy ds 1 +g_ac_xl ds 1 ;Center-point coordinates +g_ac_xh ds 1 ; for array-based line +g_ac_y ds 1 ; draw (&AT, &PLOT). 
+g_out_x ds 2 ;16-bit coordinates for +g_out_y ds 2 ; array-based line draw + + + + lst on +end equ * + sav amperfdraw + lst off diff --git a/FDRAW.CIRCLE.S b/FDRAW.CIRCLE.S new file mode 100644 index 0000000..bfe84db --- /dev/null +++ b/FDRAW.CIRCLE.S @@ -0,0 +1,752 @@ +******************************** +* * +* Fast Apple II Graphics * +* By Andy McFadden * +* Version 0.3, Aug 2015 * +* * +* Circle rendering * +* (Included by FDRAW.S) * +* * +* Developed with Merlin-16 * +* * +******************************** + +* TODO: if USE_FAST is 0, replace the outline circle +* plot code with calls to DrawPoint (or maybe a +* common sub-function so we don't trash the input +* parameters). Saves a little space. + + +******************************** +* +* Draw a circle. The radius is in in_rad, and +* the center is at in_x0l+in_x0h,in_y0. +* +******************************** +DrawCircle + lda #$20 ;JSR + cmp _cp08 ;configured for outline? + beq :okay + jsr fixcplot +:okay + jmp calc_circle + + +******************************** +* +* Draw filled circle. +* +******************************** +FillCircle + lda #$2c ;BIT + cmp _cp08 ;configured for fill? + beq :okay + jsr fixcplot +:okay + jsr calc_circle + jmp FillRaster + + +* Calculate a circle, using Bresenham's algorithm. The +* results are placed into the rasterization buffers. +* +* in_rad must be from 0 to 255. The x/y center +* coordinates must be on the screen, but the circle +* can extend off the edge. +* +* The computed values are stored in the rasterization +* tables. For an outline circle, we also plot the +* points immediately. 
+ + do USE_FAST ;***** +* local storage -- not used often enough to merit DP +circ_8bit ds 1 +circ_clip ds 1 + fin ;***** + +calc_circle +max_fast_rad equ 41 +]cxl equ zloc0 +]cxh equ zloc1 +]cy equ zloc2 +]dlo equ zloc3 +]dhi equ zloc4 +]xsav equ zloc5 +]ysav equ zloc6 +]min_x equ zloc7 ;min/max offsets from center +]max_x equ zloc8 ;(min is above center, max +]min_y equ zloc9 ; is below) +]max_y equ zloc10 +]hitmp equ zloc11 +* only used by hplot for outline circles +]hbasl equ zptr0 +]andmask equ zloc11 ;overlaps with ]hitmp +]savxreg equ zloc12 +]savyreg equ zloc13 + +* Special-case radius=0. It removes an annoying +* edge case (first y-- becomes 0xff, but 6502 cmp +* is unsigned). + lda in_rad + bne :notzero + ldy in_y0 + sty rast_top + sty rast_bottom + lda in_x0l + sta rastx0l,y + sta rastx1l,y + lda in_x0h + sta rastx0h,y + sta rastx1h,y + rts + +* Use different version of function for small +* circles, because we can do it all in 8 bits. +:notzero + do USE_FAST ;***** + ldy #$01 + cmp #max_fast_rad ;in_rad in Acc + blt :use_fast + dey +:use_fast sty circ_8bit + fin ;***** + + lda in_x0l ;copy center to DP for speed + sta ]cxl + lda in_x0h + sta ]cxh + lda in_y0 + sta ]cy + +* Compute min/max values, based on offset from center. +* These are compared against offset-from-center x/y. +* We need tight bounds on Y because we use it to +* compute the rast_render top/bottom. Getting tight +* bounds on X is not so important, but we still need +* it for the no-clip optimization. + ldx #$04 ;count edges needing clip + + lda #NUM_ROWS-1 ;191 + sec + sbc ]cy ;maxY = 191-cy + cmp in_rad + blt :ylimok + lda in_rad ;clamp to radius + dex +:ylimok sta ]max_y ;maxY = 191-cy + + lda ]cy ;minY = cy + cmp in_rad + blt :ylimok2 + lda in_rad ;clamp to radius + dex +:ylimok2 sta ]min_y + + lda ]cxh + beq :xlimlo +* Examples (note # bad, must use rad +* cx=24, 23-24=255 + carry clear --> ok, chk rad +* cx=255, 23-255=24 + carry clear --> ok, chk rad +:xlimlo + lda # 255) ? 
+ cmp in_rad + blt :xlimok2 + lda in_rad ;clamp to radius + dex +:xlimok2 sta ]min_x + +:xlimdone + + do USE_FAST ;***** + stx circ_clip + fin ;***** + +* set top/bottom rows for rasterizer + lda ]cy + clc + adc ]max_y + sta rast_bottom + lda ]cy + sec + sbc ]min_y + sta rast_top + + DO 0 ;debug debug debug + LDA ]min_x ;save a copy where the + STA $0380 ; monitor won't trash it + LDA ]max_x + STA $0381 + LDA ]min_y + STA $0382 + LDA ]max_y + STA $0383 + FIN + +* Set initial conditions for Bresenham. + ldx #0 ;:x = 0 + stx ]xsav + ldy in_rad ;:y = rad + sty ]ysav + lda #1 ;:d = 1 - rad + sec + sbc ]ysav ;in_rad + sta ]dlo + bcs :hizero ;C==1 if in_rad<=1 + ldx #$ff ;C was 0, make neg +:hizero stx ]dhi + +* +* Outer loop -- plot 8 points, then update values. +* +circ_loop + + do USE_FAST ;***** + lda circ_clip + beq ncypy + jmp with_clip + +* Quick version, no clipping required +* row cy+y: cx-x and cx+x +ncypy + lda ]ysav + clc + adc ]cy + tay ;y-coord in Y-reg + + lda ]cxl + sec + sbc ]xsav + sta rastx0l,y + lda ]cxh + sbc #$00 + sta rastx0h,y +_cp00 jsr cplotl + + lda ]cxl + clc + adc ]xsav + sta rastx1l,y + lda ]cxh + adc #$00 + sta rastx1h,y +_cp01 jsr cplotrn + +* row cy-y: cx-x and cx+x +ncymy + lda ]cy + sec + sbc ]ysav + tay ;y-coord in Y-reg + + lda ]cxl + sec + sbc ]xsav + sta rastx0l,y + lda ]cxh + sbc #$00 + sta rastx0h,y +_cp02 jsr cplotl + + lda ]cxl + clc + adc ]xsav + sta rastx1l,y + lda ]cxh + adc #$00 + sta rastx1h,y +_cp03 jsr cplotrn + +* row cy+x: cx-y and cx+y +ncypx + lda ]xsav ;off bottom? 
+         clc
+         adc   ]cy
+         tay               ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp04    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp05    jsr   cplotrn
+
+* row cy-x: cx-y and cx+y
+ncymx
+         lda   ]cy
+         sec
+         sbc   ]xsav
+         tay               ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp06    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp07    jsr   cplotrn
+
+* CLICK
+         jmp   circ_plot_done
+
+         fin               ;***** (USE_FAST)
+
+*
+* Same thing, but this time clipping edges.
+*
+* Each of the four rows is skipped when its offset
+* from the center exceeds ]min_y / ]max_y.  On a row
+* that is kept, a left edge beyond ]min_x is stored
+* as 0 and a right edge beyond ]max_x is stored as
+* NUM_COLS-1; clipped edges are not plotted.
+*
+with_clip
+
+* row cy+y: cx-x and cx+x
+ccypy
+         lda   ]ysav       ;off bottom?
+         cmp   ]max_y
+         beq   :cypy_ok
+         bge   cypy_skip   ;completely off screen
+:cypy_ok clc
+         adc   ]cy
+         tay               ;y-coord in Y-reg
+
+         ldx   ]xsav       ;handle cx-x
+         cpx   ]min_x
+         blt   :cxmx_ok
+         beq   :cxmx_ok
+         lda   #0          ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmx_done0  ;always
+         BREAK
+:cxmx_ok lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp08    jsr   cplotl
+cxmx_done0
+
+         cpx   ]max_x      ;handle cx+x
+         blt   :cxpx_ok
+         beq   :cxpx_ok
+         lda   #<NUM_COLS-1 ;clip at right edge: both
+         sta   rastx1l,y   ; bytes of the 16-bit value
+         lda   #>NUM_COLS-1 ; must be stored
+         sta   rastx1h,y
+         bne   cxpx_done0  ;always (high byte nonzero)
+         BREAK
+:cxpx_ok lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp09    jsr   cplotr
+cxpx_done0
+cypy_skip
+
+* row cy-y: cx-x and cx+x
+ccymy
+         lda   ]ysav       ;off top?
+         cmp   ]min_y
+         beq   :cymy_ok
+         bge   cymy_skip   ;row is above the screen
+:cymy_ok lda   ]cy
+         sec
+         sbc   ]ysav
+         tay               ;y-coord in Y-reg
+
+         ldx   ]xsav       ;handle cx-x
+         cpx   ]min_x
+         blt   :cxmx_ok
+         beq   :cxmx_ok
+         lda   #0          ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmx_done1  ;always
+         BREAK
+:cxmx_ok lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp10    jsr   cplotl
+cxmx_done1
+
+         cpx   ]max_x      ;handle cx+x
+         blt   :cxpx_ok
+         beq   :cxpx_ok
+         lda   #<NUM_COLS-1 ;clip at right edge: both
+         sta   rastx1l,y   ; bytes of the 16-bit value
+         lda   #>NUM_COLS-1 ; must be stored
+         sta   rastx1h,y
+         bne   cxpx_done1  ;always (high byte nonzero)
+         BREAK
+:cxpx_ok lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp11    jsr   cplotr
+cxpx_done1
+cymy_skip
+
+* row cy+x: cx-y and cx+y
+ccypx
+         lda   ]xsav       ;off bottom?
+         cmp   ]max_y
+         beq   :cypx_ok
+         bge   cypx_skip
+:cypx_ok clc
+         adc   ]cy
+         tay               ;y-coord in Y-reg
+
+         ldx   ]ysav       ;handle cx-y
+         cpx   ]min_x
+         blt   :cxmy_ok
+         beq   :cxmy_ok
+         lda   #0          ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmy_done2  ;always
+         BREAK
+:cxmy_ok lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp12    jsr   cplotl
+cxmy_done2
+
+         cpx   ]max_x      ;handle cx+y
+         blt   :cxpy_ok
+         beq   :cxpy_ok
+         lda   #<NUM_COLS-1 ;clip at right edge: both
+         sta   rastx1l,y   ; bytes of the 16-bit value
+         lda   #>NUM_COLS-1 ; must be stored
+         sta   rastx1h,y
+         bne   cxpy_done2  ;always (high byte nonzero)
+         BREAK
+:cxpy_ok lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp13    jsr   cplotr
+cxpy_done2
+cypx_skip
+
+* row cy-x: cx-y and cx+y
+ccymx
+         lda   ]xsav       ;off top?
+         cmp   ]min_y
+         beq   :cymx_ok
+         bge   cymx_skip   ;row is above the screen
+:cymx_ok lda   ]cy
+         sec
+         sbc   ]xsav
+         tay               ;y-coord in Y-reg
+
+         ldx   ]ysav       ;handle cx-y
+         cpx   ]min_x
+         blt   :cxmy_ok
+         beq   :cxmy_ok
+         lda   #0          ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmy_done3  ;always
+         BREAK
+:cxmy_ok lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp14    jsr   cplotl
+cxmy_done3
+
+         cpx   ]max_x      ;handle cx+y
+         blt   :cxpy_ok
+         beq   :cxpy_ok
+         lda   #<NUM_COLS-1 ;clip at right edge: both
+         sta   rastx1l,y   ; bytes of the 16-bit value
+         lda   #>NUM_COLS-1 ; must be stored
+         sta   rastx1h,y
+         bne   cxpy_done3  ;always (high byte nonzero)
+         BREAK
+:cxpy_ok lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp15    jsr   cplotr
+cxpy_done3
+cymx_skip
+
+circ_plot_done
+* Update X/Y/D.  Up to about radius=41 we can maintain
+* 'd' in an 8-bit register.
+         do    USE_FAST    ;*****
+         lda   circ_8bit
+         beq   circ_slow
+
+*
+* Bresenham update, with 8-bit 'd'.
+*
+         ldx   ]xsav
+         lda   ]dlo
+         bmi   :dneg
+         txa               ;:d = d + ((x-y)*4) +5
+         sec
+         sbc   ]ysav       ;x <= y, may be neg or 0
+         asl
+         asl
+         clc               ;can't know carry
+         adc   #5
+         clc               ;still don't want carry
+         adc   ]dlo
+         sta   ]dlo
+         dec   ]ysav       ;:y--
+         jmp   :loopbot
+:dneg    txa               ;:d = d + (x*4) +3
+         asl
+         asl               ;x always pos, C=0
+         DO    0
+         BCC   :TEST       ;debug
+         BREAK             ;debug
+:TEST                      ;debug
+         FIN
+         adc   #3
+         adc   ]dlo
+         sta   ]dlo
+:loopbot
+         inx               ;:x++
+         stx   ]xsav
+         cpx   ]ysav
+         beq   :again      ;x == y: one more pass
+         bge   circ_done   ;x > y: finished
+:again   jmp   circ_loop
+
+         fin               ;*****
+
+*
+* Bresenham update, with 16-bit 'd'
+*
+circ_slow
+         CLICK
+         ldx   ]xsav
+         lda   ]dhi
+         bmi   :dneg
+         lda   ]dlo
+         clc
+         adc   #5
+         sta   ]dlo
+         bcc   :noinc
+         inc   ]dhi
+:noinc
+         txa               ;:d = d + ((x-y)*4) +5
+         ldy   #$00
+         sty   ]hitmp
+         sec
+         sbc   ]ysav       ;x <= y, may be neg or 0
+         beq   :xeqy       ;if x==y, nothing to add
+         ldy   #$ff        ;negative: sign-extend
+         sty   ]hitmp
+         asl
+         rol   ]hitmp
+         asl
+         rol   ]hitmp
+         clc
+         adc   ]dlo
+         sta   ]dlo
+         lda   ]dhi
+         adc   ]hitmp
+         sta   ]dhi
+:xeqy
+         dec   ]ysav       ;:y--
+         jmp   :loopbot
+
+:dneg    lda   ]dlo        ;:d = d + (x*4) + 3
+         clc
+         adc   #3
+         sta   ]dlo
+         bcc   :noinc2
+         inc   ]dhi
+:noinc2  txa
+         ldy   #0          ;x always positive
+         sty   ]hitmp
+         asl
+         rol   ]hitmp
+         asl
+         rol   ]hitmp
+         clc               ;not
needed? + adc ]dlo + sta ]dlo + lda ]dhi + adc ]hitmp + sta ]dhi +:loopbot + inx ;:x++ + stx ]xsav + cpx ]ysav + beq :again + bge circ_done +:again jmp circ_loop + + +circ_done rts + + +* Plot a point for outline circle rendering. +* +* X and Y must be preserved. Y holds the current line +* number. +* +* Most DP locations are in use -- see the variable +* declarations at the start of the circle function. + +* cplotl is the entry point for the leftmost point. +cplotl + stx ]savxreg + sty ]savyreg + + lda ylooklo,y + sta ]hbasl + lda ylookhi,y +_pg_or2 ora #$20 + sta ]hbasl+1 + +* Convert the X coordinate into byte/bit. + ldx rastx0l,y ;x coord, lo + lda rastx0h,y ;>= 256? + beq :lotabl ;no, use the low table + ldy div7hi,x + lda mod7hi,x + bpl cplotcom ;always + BREAK ;debug +:lotabl ldy div7lo,x + lda mod7lo,x + jmp cplotcom + +* cplotr is the entry point for the rightmost point. +* We use rastx1 instead of rastx0. +cplotr + lda ylooklo,y + sta ]hbasl + lda ylookhi,y +_pg_or3 ora #$20 + sta ]hbasl+1 + +* If we just plotted the left point on the same line, +* we can skip the Y-lookup by jumping here. +cplotrn + stx ]savxreg + sty ]savyreg + + ldx rastx1l,y ;x coord, lo + lda rastx1h,y ;>= 256? + beq :lotabl ;no, use the low table + ldy div7hi,x + lda mod7hi,x + bpl cplotcom ;always + BREAK ;debug +:lotabl ldy div7lo,x + lda mod7lo,x + +* Plot the point. The byte offset (0-39) is in Y, +* the bit offset (0-6) is in A. +cplotcom + tax + lda colorline,y ;start with color pattern + eor (]hbasl),y ;flip all bits + and andmask,x ;clear other bits + eor (]hbasl),y ;restore ours, set theirs + sta (]hbasl),y + + ldx ]savxreg + ldy ]savyreg + rts + +* Reconfigure calc_circle to either JSR to cplotl/r, +* or just BIT the address (a 4-cycle no-op). The +* desired instruction is in A. 
+fixcplot + do USE_FAST ;***** + sta _cp00 + sta _cp01 + sta _cp02 + sta _cp03 + sta _cp04 + sta _cp05 + sta _cp06 + sta _cp07 + fin ;***** + sta _cp08 + sta _cp09 + sta _cp10 + sta _cp11 + sta _cp12 + sta _cp13 + sta _cp14 + sta _cp15 + rts diff --git a/FDRAW.LINE.S b/FDRAW.LINE.S new file mode 100644 index 0000000..db0df77 --- /dev/null +++ b/FDRAW.LINE.S @@ -0,0 +1,588 @@ +******************************** +* * +* Fast Apple II Graphics * +* By Andy McFadden * +* Version 0.3, Aug 2015 * +* * +* Point and line functions * +* (Included by FDRAW.S) * +* * +* Developed with Merlin-16 * +* * +******************************** + + +******************************** +* +* Draw a single point in the current color. +* +******************************** +DrawPoint +]hbasl equ zptr0 + + ldy in_y0 + lda ylooklo,y + sta ]hbasl + lda ylookhi,y + ora g_page + sta ]hbasl+1 + + ldx in_x0l ;x coord, lo + lda in_x0h ;>= 256? + beq :lotabl ;no, use the low table + ldy div7hi,x + lda mod7hi,x + bpl :plotit ;always + BREAK ;debug +:lotabl ldy div7lo,x + lda mod7lo,x + +* Plot the point. The byte offset (0-39) is in Y, +* the bit offset (0-6) is in A. +:plotit + tax + lda colorline,y ;start with color pattern + eor (]hbasl),y ;flip all bits + and andmask,x ;clear other bits + eor (]hbasl),y ;restore ours, set theirs + sta (]hbasl),y + rts + + +******************************** +* +* Draw a line between two points. +* +******************************** +DrawLine + +]hbasl equ zptr0 +]xposl equ zloc0 ;always left edge +]xposh equ zloc1 +]ypos equ zloc2 ;top or bottom +]deltaxl equ zloc3 +]deltaxh equ zloc4 +]deltay equ zloc5 +]count equ zloc6 +]counth equ zloc7 +]diff equ zloc8 +]diffh equ zloc9 +]andmask equ zloc10 +]wideflag equ zloc11 ;doesn't really need DP + +* We use a traditional Bresenham run-length approach. +* Run-slicing is possible, but the code is larger +* and the increased cost means it's only valuable +* for longer lines. 
An optimal solution would switch +* approaches based on line length. +* +* Start by identifying where x0 or x1 is on the +* left. To make life simpler we always work from +* left to right, flipping the coordinates if +* needed. +* +* We also need to figure out if the line is more +* than 255 pixels long -- which, because of +* inclusive coordinates, means abs(x0-x1) > 254. + lda in_x1l ;assume x0 on left + sec + sbc in_x0l + tax + beq checkvert ;low bytes even, check hi + lda in_x1h + sbc in_x0h + bcs lx0left + +* x1 is on the left, so the values are negative +* (hi byte in A, lo byte in X) +lx0right eor #$ff ;invert hi + sta ]deltaxh ;store + txa + eor #$ff ;invert lo + sta ]deltaxl + inc ]deltaxl ;add one for 2s complement + bne :noinchi ;rolled into high byte? + inc ]deltaxh ;yes +:noinchi lda in_x1l ;start with x1 + sta ]xposl + lda in_x1h + sta ]xposh + lda in_y1 + sta ]ypos + sec + sbc in_y0 ;compute deltay + jmp lncommon + +checkvert + lda in_x1h ;diff high bytes + sbc in_x0h ;(carry still set) + blt lx0right ;width=256, x0 right + bne lx0left ;width=256, x0 left + jmp vertline ;all zero, go vert + +* (branch back from below) +* This is a purely horizontal line. We farm the job +* out to the raster fill code for speed. (There's +* no problem with the line code handling it; its just +* more efficient to let the raster code do it.) +phorizontal + ldy ]ypos + sty rast_top + sty rast_bottom + lda ]xposl + sta rastx0l,y + clc + adc ]deltaxl ;easier to add delta back + sta rastx1l,y ; in than sort out which + lda ]xposh ; arg is left vs. right + sta rastx0h,y + adc ]deltaxh + sta rastx1h,y + jmp FillRaster + +* x0 is on the left, so the values are positive +lx0left stx ]deltaxl + sta ]deltaxh + lda in_x0l ;start with x0 + sta ]xposl + lda in_x0h + sta ]xposh + lda in_y0 ;and y0 + sta ]ypos + sec + sbc in_y1 ;compute deltay + +* Value of (starty - endy) is in A, flags still set. 
+lncommon + bcs :posy + eor #$ff ;negative, invert + adc #$01 + sta ]deltay + lda #$e8 ;INX + bne gotdy +:posy +_lmb beq phorizontal + sta ]deltay + lda #$ca ;DEX +gotdy sta _hmody + sta _vmody + sta _wmody + + do 0 ;***** for regression test + ldx #$01 + lda ]deltaxh + bne :iswide + lda ]deltaxl + cmp #$ff ;== 255? + beq :iswide + ldx #$00 ;notwide +:iswide stx $300 + lda ]xposl + sta $301 + lda ]xposh + sta $302 + lda ]ypos + sta $303 + ldx ]deltaxl + stx $304 + ldx ]deltaxh + stx $305 + ldx ]deltay + stx $306 + lda _hmody + and #$20 ;nonzero means inc, + sta $307 ; zero means dec + fin ;***** + +* At this point we have the initial X position in +* ]startxl/h, the initial Y position in ]starty, +* deltax in ]deltaxl, deltay in ]deltay, and we've +* tweaked the Y-update instructions to either INC or +* DEC depending on the direction of movement. +* +* The next step is to decide whether the line is +* horizontal-dominant or vertical-dominant, and +* branch to the appropriate handler. +* +* The core loops for horiz and vert take about +* 80 cycles when moving diagonally, and about +* 20 fewer when moving in the primary direction. +* The wide-horiz is a bit slower. + ldy #$01 ;set "wide" flag to 1 + lda ]deltaxl + ldx ]deltaxh + bne horzdom ;width >= 256 + cmp #$ff ;width == 255 + beq horzdom + dey ;not wide + cmp ]deltay + bge horzdom ; for diagonal lines + jmp vertdom + +* We could special-case pure-diagonal lines here +* (just BEQ a couple lines up). It does +* represent our worst case. I'm not convinced +* we'll see them often enough to make it worthwhile. + + +* horizontal-dominant +horzdom + sty ]wideflag + sta ]count ;:count = deltax + 1 + inc ]count + lsr ;:diff = deltax / 2 + sta ]diff + +* set Y to the byte offset in the line +* load the AND mask into ]andmask + ldx ]xposl + lda ]xposh ;>= 256? 
+ beq :lotabl ;no, use the low table + ldy div7hi,x + lda mod7hi,x + bpl :gottab ;always +* BREAK ;debug +:lotabl ldy div7lo,x + lda mod7lo,x +:gottab + tax + lda andmask,x + sta ]andmask + +* Set initial value for line address. + ldx ]ypos + lda ylooklo,x + sta ]hbasl + lda ylookhi,x + ora g_page + sta ]hbasl+1 + + lda ]wideflag ;is this a "wide" line? + beq :notwide ;nope, stay local + jmp widedom + +:notwide lda colorline,y ;set initial color mask + sta _hlcolor+1 + jmp horzloop + +hrts rts + +* bottom of loop, essentially +hnoroll sta ]diff ;3 +hdecc dec ]count ;5 :count-- + beq hrts ;2 :while (count != 0) + ;= 7 or 10 + +* We keep the byte offset in the line in Y, and the +* line index in X, for the entire loop. +horzloop +_hlcolor lda #$00 ;2 start with color pattern +_lmdh eor (]hbasl),y ;5 flip all bits + and ]andmask ;3 clear other bits + eor (]hbasl),y ;5 restore ours, set theirs + sta (]hbasl),y ;6 = 21 + +* Move right. We shift the bit mask that determines +* the pixel. When we shift into bit 7, we know it's +* time to advance another byte. +* +* If this is a shallow line we would benefit from +* keeping the index in X and just doing a 4-cycle +* indexed load to get the mask. Not having the +* line number in X makes the line calc more +* expensive for steeper lines though. + lda ]andmask ;3 + asl ;2 shift, losing hi bit + eor #$80 ;2 set the hi bit + bne :noh8 ;3 cleared hi bit? +* We could BEQ away and branch back in, but this +* happens every 7 iterations, so on average it's +* a very small improvement. If we happen to branch +* across a page boundary the double-branch adds +* two more cycles and we lose. + iny ;2 advance to next byte + lda colorline,y ;4 update color mask + sta _hlcolor+1 ;4 + lda #$81 ;2 reset +:noh8 sta ]andmask ;3 = 13 + ((12-1)/7) = 14 + +* Update error diff. + lda ]diff ;3 + sec ;2 + sbc ]deltay ;3 :diff -= deltay + bcs hnoroll ;2+ :if (diff < 0) ... 
+ ;= 11 level, 10 up/down + adc ]deltaxl ;3 : diff += deltax + sta ]diff ;3 +_hmody inx ;2 : ypos++ (or --) + lda ylooklo,x ;4 update hbasl after line + sta ]hbasl ;3 change + lda ylookhi,x ;4 +_pg_or4 ora #$20 ;2 + sta ]hbasl+1 ;3 + bne hdecc ;3 = +27 this path -> 37 + BREAK +* horizontal: 10+21+14+11=56 cycles/pixel +* diagonal: 7+21+14+37=79 cycles/pixel + + +* Vertical-dominant line. Could go up or down. +vertdom + ldx in_y0 + cpx ]ypos ;starting at y0? + bne :endy0 ;yup + ldx in_y1 ;nope +:endy0 stx _vchk+1 ;end condition + + lda ]deltay + lsr + sta ]diff ;:diff = deltay / 2 + +* set Y to the byte offset in the line +* load the AND mask into ]andmask + ldx ]xposl + lda ]xposh ;>= 256? + beq :lotabl ;no, use the low table + ldy div7hi,x + lda mod7hi,x + bpl :gottab ;always + BREAK ;debug +:lotabl ldy div7lo,x + lda mod7lo,x +:gottab + tax + lda andmask,x ;initial pixel mask + sta ]andmask + + lda colorline,y ;initial color mask + sta _vlcolor+1 + + ldx ]ypos + jmp vertloop + +* We keep the byte offset in the line in Y, and the +* line index in X, for the entire loop. + +* Bottom of loop, essentially. +vnoroll sta ]diff ;3 + +vertloop + lda ylooklo,x ;4 + sta ]hbasl ;3 + lda ylookhi,x ;4 +_pg_or5 ora #$20 ;2 + sta ]hbasl+1 ;3 = 16 + +_vlcolor lda #$00 ;2 start with color pattern +_lmdv eor (]hbasl),y ;5 flip all bits + and ]andmask ;3 clear other bits + eor (]hbasl),y ;5 restore ours, set theirs + sta (]hbasl),y ;6 = 21 + +_vchk cpx #$00 ;2 was this last line? + beq vrts ;2 yes, done +_vmody inx ;2 :ypos++ (or --) + +* Update error diff. + lda ]diff ;3 + sec ;2 + sbc ]deltaxl ;3 :diff -= deltax + bcs vnoroll ;2 :if (diff < 0) ... + ;= 10 vert, 9 move right + + adc ]deltay ;3 : diff += deltay + sta ]diff ;3 +* Move right. We shift the bit mask that determines +* the pixel. When we shift into bit 7, we know it's +* time to advance another byte. 
+         lda   ]andmask    ;3
+         asl               ;2 shift, losing hi bit
+         eor   #$80        ;2 set the hi bit
+         beq   :is8        ;2+ goes to zero on 8th bit
+         sta   ]andmask    ;3
+         bne   vertloop    ;3 = 21 + (18/7) = 24
+         BREAK
+
+:is8     iny               ;2 advance to next byte
+         lda   colorline,y ;4 update color
+         sta   _vlcolor+1  ;4
+         lda   #$81        ;2 reset
+         sta   ]andmask    ;3
+         bne   vertloop    ;3 = 18
+         BREAK
+vrts     rts
+* vertical: 3 + 16 + 21 + 6 + 10 = 56 cycles
+* diagonal: 16 + 21 + 6 + 9 + 24 = 76 cycles
+
+
+* "Wide" horizontally-dominant loop.  We have to
+* maintain error-diff and deltax as 16-bit values.
+* Most of the setup from the "narrow" version carried
+* over, but we have to re-do the count and diff.
+*
+* Normally we set count to (deltax + 1) and decrement
+* to zero, but it's actually easier to set it equal
+* to deltax and check for -1.
+*
+* On entry Y holds the byte offset within the row;
+* ]hbasl and ]andmask were set up by horzdom.
+widedom
+         lda   ]deltaxh    ;:count = deltax
+         sta   ]counth
+         ldx   ]deltaxl
+         stx   ]count
+         stx   ]diff
+         lsr               ;:diff = deltax / 2
+         ror   ]diff
+         sta   ]diffh
+         ldx   ]ypos
+
+         lda   colorline,y ;set initial color mask
+         sta   _wlcolor+1
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+wideloop
+_wlcolor lda   #$00        ;2 start with color pattern
+_lmdw    eor   (]hbasl),y  ;5 flip all bits
+         and   ]andmask    ;3 clear other bits
+         eor   (]hbasl),y  ;5 restore ours, set theirs
+         sta   (]hbasl),y  ;6 = 21
+
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+*
+* The new color mask must be patched into this loop's
+* own operand (_wlcolor+1).  Writing it to _hlcolor+1
+* would patch the narrow loop and leave the wide loop
+* drawing every byte with the first byte's pattern.
+         lda   ]andmask    ;3
+         asl               ;2 shift, losing hi bit
+         eor   #$80        ;2 set the hi bit
+         bne   :not7       ;3 goes to zero on 8th bit
+         iny               ; 2 advance to next byte
+         lda   colorline,y ; 4 update color mask
+         sta   _wlcolor+1  ; 4
+         lda   #$81        ; 2 reset
+:not7    sta   ]andmask    ;3 = 13 usually, 25 every 7
+
+* Update error diff, which is a positive number.  If
+* it goes negative ("if (diff < 0)") we act.
+         lda   ]diff
+         sec
+         sbc   ]deltay     ;:diff -= deltay
+         bcs   wnoroll     ;didn't even roll low byte
+         dec   ]diffh      ;check hi byte (DEC leaves A and carry alone)
+         bpl   wnoroll     ;went 1->0, keep going
+
+         adc   ]deltaxl    ;: diff += deltax (carry clear: BCS fell through)
+         sta   ]diff
+         lda   ]diffh
+         adc   ]deltaxh    ;carry in from the low-byte add
+         sta   ]diffh
+_wmody   inx               ;: ypos++ (or --)
+         lda   ylooklo,x   ;update hbasl after line
+         sta   ]hbasl      ; change
+         lda   ylookhi,x
+_pg_or6  ora   #$20        ;operand patched by SetPage
+         sta   ]hbasl+1
+         bne   wdecc       ;always: hi byte includes the page bits
+         BREAK
+
+wnoroll  sta   ]diff       ;A still holds the low byte of diff
+
+wdecc    dec   ]count      ;5 :count--
+         lda   ]count      ;3
+         cmp   #$ff        ;2
+         bne   wideloop    ;3 :while (count > -1)
+         dec   ]counth     ;low rolled, decr high
+         beq   wideloop    ;went 1->0, keep going
+         rts
+
+
+* Pure-vertical line. These are common in certain
+* applications, and checking for it only adds two
+* cycles to the general case.
+vertline
+         ldx   in_y0
+         ldy   in_y1
+         cpx   in_y1       ;y0 < y1?
+         blt   :usey0      ;yes, go from y0 to y1
+         txa               ;swap X/Y (Y = y0 via A)
+         tay
+         ldx   in_y1
+:usey0   stx   ]ypos       ;X = first line drawn, Y = last
+         iny               ;loop stops when X reaches last+1
+         sty   _pvytest+1
+
+         ldx   in_x0l      ;xc lo
+         lda   in_x0h      ;>= 256?
+         beq   :lotabl
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   :gotit      ;always
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+
+* Byte offset is in Y, mod-7 value is in A.
+:gotit   tax
+         lda   andmask,x
+         sta   _pvand+1    ;this doesn't change
+
+         lda   colorline,y
+         sta   _pvcolor+1  ;nor does this
+
+         ldx   ]ypos       ;top line
+
+* There's a trick where, when (linenum & 0x07) is
+* nonzero, you just add 4 to hbasl+1 instead of
+* re-doing the lookup. However, TXA+AND+BEQ
+* followed by LDA+CLC+ADC+STA is 16 cycles, the same
+* as our self-modified lookup, so it's not a win.
+* (And if we used a second ylookhi and self-modded
+* the table address, we could shave off another 2.)
+
+* Main pure-vertical loop
+pverloop
+         lda   ylooklo,x   ;4
+         sta   ]hbasl      ;3
+         lda   ylookhi,x   ;4
+_pg_or7  ora   #$20        ;2 operand patched by SetPage
+         sta   ]hbasl+1    ;3 (= 16)
+
+_pvcolor lda   #$00        ;2 start with color pattern
+_lmdpv   eor   (]hbasl),y  ;5 flip all bits
+_pvand   and   #$00        ;2 clear other bits
+         eor   (]hbasl),y  ;5
+         sta   (]hbasl),y  ;6 (= 20)
+
+         inx               ;2
+_pvytest cpx   #$00        ;2 done?
+         bne   pverloop    ;3 = 7
+         rts
+* 43 cycles/pixel
+
+
+********************************
+* In: in_arg = 0 for standard lines, nonzero for xdraw.
+* Set the line mode according to in_arg
+*
+* A slightly silly feature to get xdraw lines
+* without really working for it.
+* Works by patching opcodes inside the line-drawing loops.
+********************************
+SetLineMode
+         lda   in_arg      ;zero selects standard mode
+         beq   :standard
+
+* configure for xdraw
+* BIT dp leaves A alone, so in the loops visible here
+* (_lmdv/_lmdw/_lmdpv) the pixel becomes screen EOR (color AND mask).
+         lda   #$24        ;BIT dp
+         sta   _lmb
+         sta   _lmdh
+         sta   _lmdv       ;vertical-dominant loop
+         sta   _lmdw       ;wide horizontal-dominant loop
+         sta   _lmdpv      ;pure-vertical loop
+         rts
+
+* configure for standard drawing
+:standard lda  #$f0        ;BEQ
+         sta   _lmb
+         lda   #$51        ;EOR (dp),y
+         sta   _lmdh
+         sta   _lmdv
+         sta   _lmdw
+         sta   _lmdpv
+         rts
diff --git a/FDRAW.S b/FDRAW.S
new file mode 100644
index 0000000..64a2096
--- /dev/null
+++ b/FDRAW.S
@@ -0,0 +1,805 @@
+********************************
+*                              *
+*    Fast Apple II Graphics    *
+*       By Andy McFadden       *
+*    Version 0.3, Aug 2015     *
+*                              *
+*       Main source file       *
+*                              *
+*   Developed with Merlin-16   *
+*                              *
+********************************
+
+* Set to 1 to build FDRAW.FAST, set to zero to
+* build FDRAW.SMALL.
+USE_FAST equ   1
+
+* Set to 1 to turn on beeps/clicks for debugging.
+NOISE_ON equ   0
+
+
+         lst   off
+         org   $6000
+
+*
+* Macros.
+*
+spkr     equ   $c030
+bell     equ   $ff3a
+
+* If enabled, click the speaker (changes flags only).
+CLICK    mac
+         do    NOISE_ON
+         bit   spkr
+         fin
+         <<<
+* If enabled, beep the speaker (scrambles regs).
+BEEP     mac
+         do    NOISE_ON
+         jsr   bell
+         fin
+         <<<
+* If enabled, insert a BRK.
+BREAK    mac
+         do    NOISE_ON
+         brk   $99
+         fin
+         <<<
+
+* In "fast" mode, we align tables on page boundaries so we
+* don't take a 1-cycle hit when the indexing crosses a page.
+* In "small" mode, we skip the alignment.
+PG_ALIGN mac
+         do    USE_FAST
+         ds    \
+         fin
+         <<<
+
+*
+* Hi-res screen constants.
+*
+BYTES_PER_ROW =  40
+NUM_ROWS =     192
+NUM_COLS =     280
+
+*
+* Variable storage. We assign generic names to
+* zero-page scratch locations, then assign variables
+* with real names to these.
+*
+* 06-09 are unused (except by SWEET-16)
+* 1a-1d are Applesoft hi-res scratch
+* cc-cf are only used by INTBASIC
+* eb-ef and ff appear totally unused by ROM routines
+*
+zptr0    equ   $1a         ;2b
+zloc0    equ   $06
+zloc1    equ   $07
+zloc2    equ   $08
+zloc3    equ   $09
+zloc4    equ   $1c
+zloc5    equ   $1d
+zloc6    equ   $cc
+zloc7    equ   $cd
+zloc8    equ   $ce
+zloc9    equ   $cf
+zloc10   equ   $eb
+zloc11   equ   $ec
+zloc12   equ   $ed
+zloc13   equ   $ee
+
+
+********************************
+*
+* Entry points for external programs.
+* (Offsets in the comments below are relative to Entry.)
+********************************
+Entry
+         jmp   Init        ;+0 initialize data tables
+         dfb   0,3         ;+3 version number
+
+*
+* Parameters passed from external programs.
+*
+in_arg   ds    1           ;+5 generic argument
+in_x0l   ds    1           ;+6 X coordinate 0, low part
+in_x0h   ds    1           ;+7 X coordinate 0, high part
+in_y0    ds    1           ;+8 Y coordinate 0
+in_x1l   ds    1           ;+9 X coordinate 1, low part
+in_x1h   ds    1           ;+10 X coordinate 1, high part
+in_y1    ds    1           ;+11 Y coordinate 1
+in_rad   ds    1           ;+12 radius for circles
+
+         ds    3           ;pad to 16 bytes
+
+         jmp   SetColor    ;+16
+         jmp   SetPage     ;+19
+         jmp   Clear       ;+22
+         jmp   DrawPoint   ;+25
+         jmp   DrawLine    ;+28
+         jmp   DrawRect    ;+31
+         jmp   FillRect    ;+34
+         jmp   DrawCircle  ;+37
+         jmp   FillCircle  ;+40
+         jmp   SetLineMode ;+43
+         jmp   noimpl      ;+46 reserved2
+         jmp   FillRaster  ;+49
+
+* Raster fill values. Top, bottom, and pointers to tables
+* for the benefit of external callers.
+rast_top ds    1           ;+52 first line FillRaster draws
+rast_bottom ds 1           ;+53 last line FillRaster draws (inclusive)
+         da    rastx0l     ;+54
+         da    rastx0h     ;+56
+         da    rastx1l     ;+58
+         da    rastx1h     ;+60
+
+noimpl   rts
+
+
+********************************
+*
+* Global variables.
+*
+********************************
+
+g_inited dfb   0           ;initialized?
+g_color  dfb   0           ;hi-res color (0-7)
+g_page   dfb   $20         ;hi-res page ($20 or $40)
+
+
+********************************
+*
+* Initialize.
+*
+********************************
+Init
+         lda   #$00
+         sta   in_arg
+         jsr   SetColor    ;set color to zero
+         jsr   SetLineMode ;set normal lines
+         lda   #$20
+         sta   in_arg
+         sta   g_inited    ;any nonzero value; SetPage tests this
+         jmp   SetPage     ;set hi-res page 1
+
+
+********************************
+*
+* Set the color.
+*
+********************************
+SetColor
+         lda   in_arg
+         cmp   g_color     ;same as the old color?
+         beq   :done       ;yes, colorline is already right
+
+         and   #$07        ;safety first
+         sta   g_color
+
+* Update the "colorline" table, which provides a quick color
+* lookup for odd/even bytes. We could also have one table
+* per color and self-mod the "LDA addr,y" instructions to
+* point to the current one, but that uses a bunch of memory
+* and is kind of ugly. Takes 16 + (12 * 40) = 496 cycles.
+         tax               ;2
+         lda   xormask,x   ;4
+         sta   :_xormsk+1  ;4
+
+         lda   oddcolor,x  ;4 fill starts at byte 39, an odd byte
+         ldy   #BYTES_PER_ROW-1 ;2
+]loop    sta   colorline,y ;5
+:_xormsk eor   #$00        ;2 operand patched above: flips odd/even pattern
+         dey               ;2
+         bpl   ]loop       ;3
+
+:done    rts
+
+
+********************************
+*
+* Set the page.
+* In: in_arg = $20 (page 1) or $40 (page 2).
+********************************
+SetPage
+         lda   g_inited    ;let's just check this
+         beq   noinit      ; (not called too often)
+
+         lda   in_arg
+         cmp   #$20
+         beq   :good
+         cmp   #$40
+         beq   :good
+         jmp   bell        ;bad value: g_page is left unchanged
+:good
+         sta   g_page
+
+         do    0           ;*****
+         cmp   ylookhi
+         beq   :tabok
+* Check to see if the values currently in the Y-lookup table
+* match our current page setting. If they don't, we need to
+* adjust the code that does lookups.
+
+* This approach modifies the table itself, paying a large
+* cost now so we don't have to pay it on every lookup.
+* However, this costs 2+(16*192)=3074 cycles, while an
+* "ORA imm" only adds two to each lookup, so we'd have
+* to do a lot of drawing to make this worthwhile.
+* (Note: assumes ylookhi is based at $2000 not $0000)
+         ldy   #NUM_ROWS   ;2
+]loop    lda   ylookhi-1,y ;4
+         eor   #$60        ;2 $20 <--> $40
+         sta   ylookhi-1,y ;5
+         dey               ;2
+         bne   ]loop       ;3
+
+         else              ;*****
+
+* This approach uses self-modifying code to update the
+* relevant instructions. It's a bit messy to have it
+* here, but it saves us from having to do it on
+* every call.
+*
+* We could also have a second y-lookup table and
+* use this to update the pointers. That would let
+* us drop the "ORA imm" entirely, without the cost
+* of the rewrite above, but eating up another 192 bytes.
+         sta   _pg_or1+1   ;rastfill
+         sta   _pg_or2+1   ;circle hplot
+         sta   _pg_or3+1   ;circle hplot
+         sta   _pg_or4+1   ;drawline, horiz-dominant loop
+         sta   _pg_or5+1   ;drawline, vert-dominant loop
+         sta   _pg_or6+1   ;drawline, wide horiz-dominant loop
+         sta   _pg_or7+1   ;drawline, pure-vertical loop
+
+         fin               ;*****
+
+:tabok   rts
+
+noinit   ldy   #$00        ;print :initmsg up to its $00 terminator
+]loop    lda   :initmsg,y
+         beq   :done
+         jsr   $fded       ;cout
+         iny
+         bne   ]loop
+:done    rts
+
+:initmsg asc   "FDRAW NOT INITIALIZED",87,87,00
+
+
+********************************
+*
+* Clear the screen to the current color.
+*
+********************************
+Clear
+
+         do    USE_FAST    ;*****
+* This performs a "visually linear" clear, erasing the screen
+* from left to right and top to bottom. To reduce the amount
+* of code required we erase in thirds (top/middle/bottom).
+*
+* Compare to a "venetian blind" clear, which is what you get
+* if you erase memory linearly.
+*
+* The docs discuss different approaches. This version
+* requires ((2 + 5*64 + 11) * 40 + 14) * 3 = 40002 cycles.
+* If we didn't divide it into thirds to keep the top-down
+* look, we'd need (5*64 + 9) * 120 = 39480 cycles, so
+* we're spending 522 cycles to avoid the venetian look.
+         lda   :clrloop+2  ;hi byte of the first STA in the unrolled loop
+         cmp   g_page
+         beq   :pageok
+
+* We're on the wrong hi-res page. Flip to the other one.
+* 4 + (20*64) = 1284 cycles to do the flip (+ a few more
+* because we're probably crossing a page boundary).
+ BEEP + ldy #NUM_ROWS ;2 +]loop lda :clrloop-3+2,y ;4 + eor #$60 ;2 + sta :clrloop-3+2,y ;5 + dey ;2 + dey ;2 + dey ;2 + bne ]loop ;3 + +:pageok ldx g_color ;grab the current color + lda xormask,x + sta :_xormsk+1 + lda evencolor,x + + ldy #0 + jsr :clearthird + ldy #BYTES_PER_ROW + jsr :clearthird + ldy #BYTES_PER_ROW*2 +* fall through into :clearthird for final pass + +:clearthird + ldx #BYTES_PER_ROW-1 ;2 +:clrloop sta $2000,y ;5 (* 64) + sta $2400,y ;this could probably be + sta $2800,y ; done with LUP math + sta $2c00,y + sta $3000,y + sta $3400,y + sta $3800,y + sta $3c00,y + sta $2080,y + sta $2480,y + sta $2880,y + sta $2c80,y + sta $3080,y + sta $3480,y + sta $3880,y + sta $3c80,y + sta $2100,y + sta $2500,y + sta $2900,y + sta $2d00,y + sta $3100,y + sta $3500,y + sta $3900,y + sta $3d00,y + sta $2180,y + sta $2580,y + sta $2980,y + sta $2d80,y + sta $3180,y + sta $3580,y + sta $3980,y + sta $3d80,y + sta $2200,y + sta $2600,y + sta $2a00,y + sta $2e00,y + sta $3200,y + sta $3600,y + sta $3a00,y + sta $3e00,y + sta $2280,y + sta $2680,y + sta $2a80,y + sta $2e80,y + sta $3280,y + sta $3680,y + sta $3a80,y + sta $3e80,y + sta $2300,y + sta $2700,y + sta $2b00,y + sta $2f00,y + sta $3300,y + sta $3700,y + sta $3b00,y + sta $3f00,y + sta $2380,y + sta $2780,y + sta $2b80,y + sta $2f80,y + sta $3380,y + sta $3780,y + sta $3b80,y + sta $3f80,y +:_xormsk eor #$00 ;2 flip odd/even bits + iny ;2 + dex ;2 + bmi :done ;2 + jmp :clrloop ;3 +:done rts + + else ;***** not USE_FAST + +* This version was suggested by Marcus Heuser on +* comp.sys.apple2.programmer. It does a "venetian blind" +* clear, and takes (5 * 32 + 7) * 248 = 41416 cycles. +* It overwrites half of the screen holes. + lda :clrloop+5 + cmp g_page + beq :pageok + +* We're on the wrong hi-res page. Flip to the other one. +* 12 + (20*31) = 632 cycles to do the flip. We have to +* single out the first entry because it's $1f not $20. 
+         BEEP
+         lda   :clrloop+2  ;4
+         eor   #$20        ;2 $1f <-> $3f
+         sta   :clrloop+2  ;4
+         ldy   #31*3       ;2 remaining 31 STAs, 3 bytes each
+]loop    lda   :clrloop+2,y ;4
+         eor   #$60        ;2 $20 <-> $40
+         sta   :clrloop+2,y ;5
+         dey               ;2
+         dey               ;2
+         dey               ;2
+         bne   ]loop       ;3
+
+:pageok  ldx   g_color
+         lda   xormask,x
+         sta   :_xormsk+1
+         lda   oddcolor,x  ;first store hits offset 247, an odd byte
+         ldy   #248        ;120 + 8 + 120
+:clrloop
+]addr    =     $1fff
+         lup   32          ;begin a loop in assembler
+         sta   ]addr,y     ;5
+]addr    =     ]addr+$100  ;sta 20ff,21ff,...
+         --^
+:_xormsk eor   #$00        ;2
+         dey               ;2
+         bne   :clrloop    ;3
+         rts
+
+         fin               ;***** not USE_FAST
+
+
+********************************
+*
+* Draw rectangle outline.
+* In: in_x0/in_y0 and in_x1/in_y1 corners.
+********************************
+DrawRect
+* We could just issue 4 line draw calls here, maybe
+* adjusting the vertical lines by 1 pixel up/down to
+* avoid overdraw. But if the user wanted 4 lines,
+* they could just draw 4 lines. Instead, we're going
+* to draw a double line on each edge to ensure that
+* the outline rectangle always has the correct color.
+*
+* Rather than draw two vertical lines, we draw a
+* two-pixel-wide filled rectangle on each side.
+*
+* We don't want to double-up if the rect is only one
+* pixel wide, so we have to check for that.
+*
+* If the rect is one pixel high, it's just a line.
+* If it's two pixels high, we don't need to draw
+* the left/right edges, just the top/bottom lines.
+* If it's more than two tall, we don't need to draw
+* the left/right edges on the top and bottom lines,
+* so we save a few cycles by skipping those.
+
+         lda   in_y1       ;copy top/bottom to local
+         sta   rast_bottom
+         dec   rast_bottom ;move up one
+         sec
+         sbc   in_y0       ;A = y1 - y0 (height - 1)
+         beq   :isline     ;1 pixel high, just draw line
+         cmp   #1
+         beq   :twolines   ;2 pixels high, lines only
+         ldy   in_y0
+         iny               ;start down a line
+         sty   rast_top
+
+         lda   in_x0h      ;check to see if left/right
+         cmp   in_x1h      ; coords are the same; if
+         bne   :notline    ; so, going +1/-1 at edge
+         lda   in_x0l      ; will overdraw.
+         cmp   in_x1l
+         bne   :notlin1    ;A already holds in_x0l
+
+:isline  jmp   DrawLine    ;just treat like line
+
+* Set up left edge. Top line is in Y.
+:notline lda   in_x0l
+:notlin1 sta   rastx0l,y
+         clc
+         adc   #1
+         sta   rastx1l,y   ;right side of left edge = x0 + 1
+         lda   in_x0h
+         ora   #$80        ;"repeat" flag
+         sta   rastx0h,y
+         and   #$7f
+         adc   #0          ;carry from the low-byte add is still intact
+         sta   rastx1h,y
+         jsr   FillRaster
+
+         ldy   rast_top
+         lda   in_x1l      ;now set up right edge
+         sta   rastx1l,y
+         sec
+         sbc   #1
+         sta   rastx0l,y   ;left side of right edge = x1 - 1
+         lda   in_x1h
+         sta   rastx1h,y
+         sbc   #0          ;borrow from the low-byte subtract
+         ora   #$80        ;"repeat" flag
+         sta   rastx0h,y
+         jsr   FillRaster
+
+* Now the top/bottom lines.
+:twolines
+         ldy   in_y0
+         jsr   :drawline
+         ldy   in_y1       ;fall through for the bottom line
+
+:drawline
+         sty   rast_top    ;one-line fill: top == bottom == Y
+         sty   rast_bottom
+         lda   in_x0l      ;copy left/right to the
+         sta   rastx0l,y   ; table entry for the
+         lda   in_x0h      ; appropriate line
+         sta   rastx0h,y   ;no repeat flag; overwrites any propagated one
+         lda   in_x1l
+         sta   rastx1l,y
+         lda   in_x1h
+         sta   rastx1h,y
+         jmp   FillRaster
+
+
+********************************
+*
+* Draw filled rectangle.
+*
+********************************
+FillRect
+* Just fill out the raster table and call the fill routine.
+* We require y0=top, y1=bottom, x0=left, x1=right.
+         ldy   in_y0
+         sty   rast_top
+         lda   in_y1
+         sta   rast_bottom
+
+         lda   in_x0l      ;only the top line's entry is filled in;
+         sta   rastx0l,y   ; the repeat flag covers the rest
+         lda   in_x0h
+         ora   #$80        ;"repeat" flag
+         sta   rastx0h,y
+         lda   in_x1l
+         sta   rastx1l,y
+         lda   in_x1h
+         sta   rastx1h,y
+
+         jmp   FillRaster
+
+
+********************************
+*
+* Fill an area defined by the raster tables.
+*
+********************************
+FillRaster
+
+* Render rasterized output. The left and right edges
+* are stored in the rastx0/rastx1 tables, and the top
+* and bottom-most pixels are in rast_top/rast_bottom.
+*
+* This can be used to render an arbitrary convex
+* polygon after it has been rasterized.
+*
+* If the high bit of the high byte of X0 is set, we
+* go into "repeat" mode, where we just repeat the
+* previous line. This saves about 40 cycles of
+* overhead per line when drawing rectangles, plus
+* what we would have to spend to populate multiple
+* lines of the raster table. It only increases the
+* general per-line cost by 3 cycles.
+*
+* We could use the "repeat" flag to use this code to
+* draw vertical lines, though that's mostly of value
+* to an external caller who knows ahead of time that
+* the line is vertical. The DrawLine code is pretty
+* good with vertical lines, and adding additional
+* setup time to every vertical-dominant line to
+* decide if it should call here seems like a
+* losing proposition.
+
+]hbasl   equ   zptr0
+]hbash   equ   zptr0+1
+]lftbyte equ   zloc0       ;left edge: byte offset in row (x / 7)
+]lftbit  equ   zloc1       ;left edge: pixel within byte (x mod 7)
+]rgtbyte equ   zloc2       ;right edge: byte offset in row
+]rgtbit  equ   zloc3       ;right edge: pixel within byte
+]line    equ   zloc4       ;NOTE(review): appears unused in this routine
+]andmask equ   zloc5
+]cur_line equ  zloc6       ;line being drawn (Y gets reused)
+]repting equ   zloc7       ;nonzero once "repeat" mode has started
+
+         ldx   g_color     ;configure color XOR byte
+         lda   xormask,x
+         do    USE_FAST    ;*****
+         cmp   rast_unroll+3 ;already configured?
+         beq   :goodmask
+         jsr   fixrastxor
+:goodmask
+         else
+         sta   _xorcolor+1
+         fin               ;*****
+
+         lda   #$00
+         sta   ]repting
+
+         ldy   rast_top
+
+* Main rasterization loop. Y holds the line number.
+rastloop
+         sty   ]cur_line   ;3
+         ldx   ylooklo,y   ;4
+         stx   ]hbasl      ;3
+         lda   ylookhi,y   ;4
+_pg_or1  ora   #$20        ;2 will be $20 or $40
+         sta   ]hbash      ;3 = 19 cycles
+         do    USE_FAST-1  ;***** i.e. not USE_FAST
+         stx   _wrhires+1
+         sta   _wrhires+2
+         fin               ;*****
+
+* divide left edge by 7
+         ldx   rastx0l,y   ;4 line num in Y
+         lda   rastx0h,y   ;4
+         bpl   :noflag     ;2
+         sta   rastx0h+1,y ;4 propagate
+         lda   ]repting    ;3 first time through?
+         beq   :firstre    ;2 yup, finish calculations
+         lda   ]rgtbyte    ;3 need this in A
+         bpl   :repeat     ;3 always
+:firstre lda   rastx0h,y   ;reload
+         sta   ]repting    ;any nonzero will do
+         and   #$7f        ;strip repeat flag
+:noflag  beq   :lotabl     ;hi byte zero: X0 < 256
+         lda   mod7hi,x
+         sta   ]lftbit
+         lda   div7hi,x
+         sta   ]lftbyte
+         bpl   :gotlft     ;always
+         BREAK             ;debug
+:lotabl  lda   mod7lo,x
+         sta   ]lftbit
+         lda   div7lo,x
+         sta   ]lftbyte
+:gotlft
+
+* divide right edge by 7
+         ldx   rastx1l,y   ;4 line num in Y
+         lda   rastx1h,y   ;4
+         beq   :lotabr     ;3
+         lda   mod7hi,x
+         sta   ]rgtbit
+         lda   div7hi,x
+         sta   ]rgtbyte
+         bpl   :gotrgt     ;always
+         BREAK             ;debug
+:lotabr  lda   mod7lo,x    ;4
+         sta   ]rgtbit     ;3
+         lda   div7lo,x    ;4
+         sta   ]rgtbyte    ;3 = 25 for X1 < 256
+:gotrgt
+
+:repeat                    ;both paths arrive with ]rgtbyte in A
+         cmp   ]lftbyte    ;3
+         bne   :not1byte   ;3
+
+* The left and right edges are in the same byte. We
+* need to set up the mask differently, so we deal with
+* it as a special case.
+         ldy   ]lftbit
+         lda   leftmask,y  ;create the AND mask
+         ldx   ]rgtbit
+         and   rightmask,x ;strip out bits on right
+         sta   ]andmask
+
+         ldy   ]lftbyte
+         lda   colorline,y ;get color bits
+         eor   (]hbasl),y  ;combine w/screen
+         and   ]andmask    ;remove not-ours
+         eor   (]hbasl),y  ;combine again
+         sta   (]hbasl),y
+         jmp   rastlinedone
+
+* This is the more general case. We special-case the
+* left and right edges, then byte-stomp the middle.
+* On entry, ]rgtbyte is in A
+:not1byte
+         sec               ;2 compute number of full
+         sbc   ]lftbyte    ;3 and partial bytes to
+         tax               ;2 draw
+         inx               ;2 X = rgtbyte - lftbyte + 1
+
+         ldy   ]rgtbit     ;3
+         cpy   #6          ;2 rightmost pixel of its byte?
+         beq   :rgtnospcl  ;3 yes, right byte is a full byte
+         lda   rightmask,y ;handle partial-byte right
+         sta   ]andmask
+         ldy   ]rgtbyte
+         lda   colorline,y
+         eor   (]hbasl),y
+         and   ]andmask
+         eor   (]hbasl),y
+         sta   (]hbasl),y
+         dex               ;adjust count
+:rgtnospcl
+
+         ldy   ]lftbit     ;3 check left for partial
+         beq   :lftnospcl  ;3
+         lda   leftmask,y  ;handle partial-byte left
+         sta   ]andmask
+         ldy   ]lftbyte
+         lda   colorline,y
+         eor   (]hbasl),y
+         and   ]andmask
+         eor   (]hbasl),y
+         sta   (]hbasl),y
+         dex               ;adjust count
+         beq   rastlinedone ;bail if all done
+         iny               ;advance start position
+         bne   :liny       ;always
+         BREAK
+:lftnospcl
+
+         ldy   ]lftbyte    ;3
+:liny                      ;here Y = first full byte, X = full-byte count
+
+         do    USE_FAST    ;***** "fast" loop
+* Instead of looping, jump into an unrolled loop.
+* Cost is 10 cycles per byte with an extra 14 cycles
+* of overhead, so we start to win at 4 bytes.
+         lda   rastunidx,x ;4
+         sta   :_rastun+1  ;4
+         lda   colorline,y ;4 get odd/even color val
+:_rastun jmp   rast_unroll ;3
+
+         else              ;***** "slow" loop
+* Inner loop of the renderer. X is at least 1 on entry
+* (a zero count bails out above), so this runs 1-40x.
+* Cost is 14 cycles/byte.
+         lda   colorline,y ;get appropriate odd/even val
+_wrhires sta   $2000,y     ;5 replaced with line addr
+_xorcolor eor  #$00        ;2 replaced with $00/$7f
+         iny               ;2
+         dex               ;2
+         bne   _wrhires    ;3
+
+         fin               ;*****
+
+rastlinedone
+         ldy   ]cur_line   ;3 more lines to go?
+         cpy   rast_bottom ;4
+         bge   :done       ;2
+         iny               ;2
+         jmp   rastloop    ;3 must have line in Y
+
+:done    rts
+
+fixrastxor
+         do    USE_FAST    ;*****
+* Update the EOR statements in the unrolled rastfill code.
+* Doing this with a loop takes ~600 cycles, doing it with
+* unrolled stores takes 160. We only do this when we
+* need to, so changing the color from green to blue won't
+* cause this to run.
+*
+* Call with the XOR value in A.
+]offset = 0 + lup BYTES_PER_ROW + sta rast_unroll+3+]offset +]offset = ]offset+5 + --^ + BEEP + rts + fin ;***** + + +* include the line functions + put FDRAW.LINE + +* include the circle functions + put FDRAW.CIRCLE + + lst on +CODE_END equ * ;end of code section + lst off + +* include the data tables + put FDRAW.TABLES + + lst on +DAT_END equ * ;end of data / BSS + lst off + +* Save the appropriate object file. + do USE_FAST + sav FDRAW.FAST + else + sav FDRAW.SMALL + fin diff --git a/FDRAW.TABLES.S b/FDRAW.TABLES.S new file mode 100644 index 0000000..d1d91f2 --- /dev/null +++ b/FDRAW.TABLES.S @@ -0,0 +1,339 @@ +******************************** +* * +* Fast Apple II Graphics * +* By Andy McFadden * +* Version 0.3, Aug 2015 * +* * +* Pre-computed data and * +* large internal buffers. * +* (Included by FDRAW.S) * +* * +* Developed with Merlin-16 * +* * +******************************** + +* Expected layout with alignment: +* +* P1 ylooklo, misc tables +* P2 ylookhi, colorline +* P3 rastx0l +* P4 rastx0h +* P5 rastx1l +* P6 rastx1h, div7hi, mod7hi +* P7 div7lo +* P8 mod7lo +* P9 rast_unroll, rastunidx +* +* Tables should be just under $900 bytes. + + PG_ALIGN + +* Hi-res Y lookup, low part (192 bytes). +ylooklo HEX 0000000000000000 + HEX 8080808080808080 + HEX 0000000000000000 + HEX 8080808080808080 + HEX 0000000000000000 + HEX 8080808080808080 + HEX 0000000000000000 + HEX 8080808080808080 + HEX 2828282828282828 + HEX a8a8a8a8a8a8a8a8 + HEX 2828282828282828 + HEX a8a8a8a8a8a8a8a8 + HEX 2828282828282828 + HEX a8a8a8a8a8a8a8a8 + HEX 2828282828282828 + HEX a8a8a8a8a8a8a8a8 + HEX 5050505050505050 + HEX d0d0d0d0d0d0d0d0 + HEX 5050505050505050 + HEX d0d0d0d0d0d0d0d0 + HEX 5050505050505050 + HEX d0d0d0d0d0d0d0d0 + HEX 5050505050505050 + HEX d0d0d0d0d0d0d0d0 + +* Color masks for odd/even bytes, colors 0-7. +evencolor dfb $00,$2a,$55,$7f,$80,$aa,$d5,$ff +oddcolor dfb $00,$55,$2a,$7f,$80,$d5,$aa,$ff + +* XOR mask for colors 0-7 - non-BW flip on odd/even. 
+xormask dfb $00,$7f,$7f,$00,$00,$7f,$7f,$00 + +* AND mask for the 7 pixel positions, high bit set +* for the color shift. +andmask dfb $81,$82,$84,$88,$90,$a0,$c0 + +* These are pixel AND masks, used with the modulo 7 +* result. Entry #2 in leftmask means we're touching +* the rightmost 5 pixels, and entry #2 in rightmask +* means we're touching the 3 leftmost pixels. +* +* The high bit is always set, because we want to +* keep the color's high bit. +leftmask dfb $ff,$fe,$fc,$f8,$f0,$e0,$c0 +rightmask dfb $81,$83,$87,$8f,$9f,$bf,$ff + + PG_ALIGN + +* Hi-res Y lookup, high part (192 bytes). +* OR with $20 or $40. +ylookhi HEX 0004080c1014181c + HEX 0004080c1014181c + HEX 0105090d1115191d + HEX 0105090d1115191d + HEX 02060a0e12161a1e + HEX 02060a0e12161a1e + HEX 03070b0f13171b1f + HEX 03070b0f13171b1f + HEX 0004080c1014181c + HEX 0004080c1014181c + HEX 0105090d1115191d + HEX 0105090d1115191d + HEX 02060a0e12161a1e + HEX 02060a0e12161a1e + HEX 03070b0f13171b1f + HEX 03070b0f13171b1f + HEX 0004080c1014181c + HEX 0004080c1014181c + HEX 0105090d1115191d + HEX 0105090d1115191d + HEX 02060a0e12161a1e + HEX 02060a0e12161a1e + HEX 03070b0f13171b1f + HEX 03070b0f13171b1f + +* Masks for current color (even/odd), e.g. 55 2a 55 2a ... +* Updated whenever the color changes. +colorline ds 40 + + PG_ALIGN +rastx0l ds NUM_ROWS + PG_ALIGN +rastx0h ds NUM_ROWS + ds 1 ;repeat mode can overstep + PG_ALIGN +rastx1l ds NUM_ROWS + PG_ALIGN +rastx1h ds NUM_ROWS + +* Lookup tables for dividing 0-279 by 7. The "hi" +* parts are 24 bytes each, so they fit inside +* the previous 192-byte entry. The "lo" parts +* each fill a page. 
+div7hi HEX 2424242525252525 + HEX 2525262626262626 + HEX 2627272727272727 +mod7hi HEX 0405060001020304 + HEX 0506000102030405 + HEX 0600010203040506 + + PG_ALIGN + +div7lo HEX 0000000000000001 + HEX 0101010101010202 + HEX 0202020202030303 + HEX 0303030304040404 + HEX 0404040505050505 + HEX 0505060606060606 + HEX 0607070707070707 + HEX 0808080808080809 + HEX 0909090909090a0a + HEX 0a0a0a0a0a0b0b0b + HEX 0b0b0b0b0c0c0c0c + HEX 0c0c0c0d0d0d0d0d + HEX 0d0d0e0e0e0e0e0e + HEX 0e0f0f0f0f0f0f0f + HEX 1010101010101011 + HEX 1111111111111212 + HEX 1212121212131313 + HEX 1313131314141414 + HEX 1414141515151515 + HEX 1515161616161616 + HEX 1617171717171717 + HEX 1818181818181819 + HEX 1919191919191a1a + HEX 1a1a1a1a1a1b1b1b + HEX 1b1b1b1b1c1c1c1c + HEX 1c1c1c1d1d1d1d1d + HEX 1d1d1e1e1e1e1e1e + HEX 1e1f1f1f1f1f1f1f + HEX 2020202020202021 + HEX 2121212121212222 + HEX 2222222222232323 + HEX 2323232324242424 +mod7lo HEX 0001020304050600 + HEX 0102030405060001 + HEX 0203040506000102 + HEX 0304050600010203 + HEX 0405060001020304 + HEX 0506000102030405 + HEX 0600010203040506 + HEX 0001020304050600 + HEX 0102030405060001 + HEX 0203040506000102 + HEX 0304050600010203 + HEX 0405060001020304 + HEX 0506000102030405 + HEX 0600010203040506 + HEX 0001020304050600 + HEX 0102030405060001 + HEX 0203040506000102 + HEX 0304050600010203 + HEX 0405060001020304 + HEX 0506000102030405 + HEX 0600010203040506 + HEX 0001020304050600 + HEX 0102030405060001 + HEX 0203040506000102 + HEX 0304050600010203 + HEX 0405060001020304 + HEX 0506000102030405 + HEX 0600010203040506 + HEX 0001020304050600 + HEX 0102030405060001 + HEX 0203040506000102 + HEX 0304050600010203 + + +* RastFill unrolled loop. At each step we store the current +* color value, XOR it to flip the bits if needed, and advance. +* The caller needs to set the appropriate initial value based +* on whether the address is odd or even. +* +* We can use a 3-cycle "EOR dp" or a 2-cycle "EOR imm". 
The
+* former is one cycle slower, the latter requires us to
+* self-mod 40 instructions when the color changes.
+*
+* This must be page-aligned so that we can take the value
+* from the rastunidx table and self-mod a JMP without having
+* to do a 16-bit add. We have just enough room for the
+* unrolled loop (40*5+3) and x5 table (41) = 244 bytes, fits
+* on a single page.
+
+         do    USE_FAST    ;*****
+         ds    \
+]hbasl   equ   zptr0       ;must match FillRaster
+rast_unroll equ *
+         lst   off
+         lup   BYTES_PER_ROW
+         sta   (]hbasl),y  ;6
+         eor   #$00        ;2 operand at +3 of each step; set by fixrastxor
+         iny               ;2 10 cycles, 5 bytes
+         --^
+         jmp   rastlinedone
+
+* Index into rast_unroll. If we need to output N bytes,
+* we want to jump to (rast_unroll + (40 - N) * 5) (where
+* 5 is the number of bytes per iteration).
+rastunidx
+]offset  =     BYTES_PER_ROW*5
+         lup   BYTES_PER_ROW+1 ;0-40
+         dfb   ]offset
+]offset  =     ]offset-5
+         --^
+
+         fin               ;*****
+
+
+********************************
+*
+* Code used to generate tables above. If you want to
+* decrease load size, use these functions to generate
+* the data into empty memory, then discard the code.
+* (Maybe use a negative DS and overlap with rastx0l?)
+*
+********************************
+         DO    0           ;*****
+
+init_ylook
+]hbasl   equ   zptr1       ;NOTE(review): FDRAW.S only equates zptr0;
+]hbash   equ   zptr1+1     ; define zptr1 before enabling this code
+
+* Initialize Y-lookup table. We just call the bascalc
+* function.
+         ldx   #NUM_ROWS   ;X counts entries, Y indexes them
+         ldy   #NUM_ROWS-1
+]loop    tya
+         jsr   bascalc
+         lda   ]hbasl      ;was "hbasl", which is not defined
+         sta   ylooklo,y
+         lda   ]hbash      ;was "hbash", which is not defined
+*        ora   #$20        ;off: ylookhi is $0000-based, page is ORed at lookup
+         sta   ylookhi,y
+         dey
+         dex
+         bne   ]loop
+         rts
+
+* Hi-res base address calculation. This is based on the
+* HPOSN routine at $F411.
+*
+* Call with the line in A. The results are placed into
+* zptr1. X and Y are not disturbed.
+*
+* The value is in the $0000-1fff range, so you must OR
+* the desired hi-res page in.
+*
+bascalc
+         pha
+         and   #$c0
+         sta   ]hbasl
+         lsr
+         lsr
+         ora   ]hbasl
+         sta   ]hbasl
+         pla
+         sta   ]hbash
+         asl
+         asl
+         asl
+         rol   ]hbash
+         asl
+         rol   ]hbash
+         asl
+         ror   ]hbasl
+         lda   ]hbash
+         and   #$1f        ;keep the $0000-1fff offset bits
+         sta   ]hbash
+         rts
+
+*
+* Create divide-by-7 tables.
+*
+mkdivtab
+]val     equ   zloc0
+
+         ldy   #0
+         sty   ]val
+         ldx   #0
+]loop    lda   ]val
+         sta   div7lo,y
+         txa
+         sta   mod7lo,y
+         inx
+         iny
+         beq   :lodone
+         cpx   #7
+         bne   ]loop
+         inc   ]val
+         ldx   #0
+         beq   ]loop       ;always
+:lodone                    ;safe to ignore ]val update
+]loop    lda   ]val
+         sta   div7hi,y
+         txa
+         sta   mod7hi,y
+         iny
+         cpy   #280-256
+         beq   :hidone
+         inx
+         cpx   #7
+         bne   ]loop
+         inc   ]val
+         ldx   #0
+         beq   ]loop       ;always
+:hidone  rts
+
+         FIN               ;*****
diff --git a/README.md b/README.md
index f70f48e..f356872 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,59 @@
-# fdraw
-Fast Apple II graphics
+fdraw
+=====
+
+Fast graphics routines for the Apple II
+By Andy McFadden
+Version 0.3, August 2015
+
+## Overview ##
+
+The fdraw library provides fast rendering of points, lines, rectangles,
+and circles, as well as high-speed screen clears, for Apple II hi-res
+graphics. It can be used from Applesoft or 6502 assembly language.
+
+Two disk images are available in the [fdraw-disks](fdraw-disks.zip) zip
+archive. `fdrawdemo.do` is a 140K disk image with the demos that will
+run on an Apple ][+ or later. `fdrawdev.po` is an 800K disk image with
+the source code, demos, and a few extras.
+
+A video of the demos running in the AppleWin emulator
+[is available](https://www.youtube.com/watch?v=z2RFGVoaROE).
+
+Learn more about how fdraw works in the
+[library documentation](docs/manual.md).
+
+Learn about the demos in the [demo documentation](docs/demos.md).
+
+Learn more about what possessed me to write a graphics library for the
+Apple II more than 20 years after the platform was discontinued in the
+[fadden's brain documentation](docs/personal-notes.md).
+
+The main bits of source code are accessible from git for easy viewing,
+but the "official" home is on `fdrawdev.po`.
+
+All code is copyright 2015 by Andy McFadden. All rights reserved. The
+source code is available under the Apache 2 license (a very friendly
+open-source license).
+ + +### Version History ### + +##### v0.1 March 13, 2006 + +No source code, just a demo with fast filled circles and screen clears. + +##### v0.2 March 20, 2006 + +Polished up the sources and published. This version implemented Clear, +FillRect, FillCircle, and FillRaster. + +##### v0.3 August 21, 2015 + +Added DrawPoint, DrawLine, DrawRect, DrawCircle, and SetLineMode. Various +size and performance improvements. + +Added Amperfdraw to make Applesoft BASIC programming easier. + +Added several more demos and tests. + +Added documentation. diff --git a/docs/demos.md b/docs/demos.md new file mode 100644 index 0000000..669562e --- /dev/null +++ b/docs/demos.md @@ -0,0 +1,167 @@ +fdraw Demo README +================= + +The fdraw distribution comes with a handful of demonstration programs. +Most of them are written in Applesoft BASIC, and use the amperfdraw +interface. This is a somewhat poor way to demonstrate animation +performance, as Applesoft adds a tremendous amount of overhead, but it +is the only way to show what you *can* do with Applesoft. + +The easiest way to run them is with the "DEMO" program, which scans the +DEMOS directory for BASIC programs and presents a list. You can also +just run them directly. + +* INTRO : Sort of a "hello, world" for fdraw. Mix of single- and + double-buffered animation. + +* CIRCULAR : Draws lots of circles. + +* RECTSPLAT : Draws lots of rectangles. + +* CUBIC : Draws a spinning wireframe 3D cube. (The 3D coordinates are + pre-computed -- fdraw doesn't do matrix transforms.) + +* TUNNEL : Animates circles to simulate driving through a tunnel. + +* LINEAR : Draws lots of lines. The wipes show speed differences for + horizontal and vertical special cases, while the circular spinner + shows HPLOT is not as fast as &HPLOT which is not as fast as &PLOT for + a set of lines at a variety of angles. + +* LINE.DIFF : Draws several lines with the ROM routines and fdraw + side-by-side to illustrate the difference in line style. 
+
+* CLEARLY : Clears the screen 32 times, 4 sets in each of the 8 colors.
+  The first round is done with the Applesoft ROM routine ("CALL 62454"),
+  the second round uses the fdraw &CLEAR function.
+
+* HRFAN : A simple line-art demo, using "xdraw" DrawLine with lines in
+  different colors. Not a great demo, as the Applesoft code driving it
+  is rather slow, but it looks pretty good if you bump up the emulation
+  speed or switch to IIgs "fast" mode. (This deserves a conversion to
+  assembly language.)
+
+* BRIAN.THEME.ORI : The Brian's Theme demo from the DOS 3.3 System
+  Master. Unmodified except for integration with the demo menu
+  system, and with the bug on line 31112 fixed.
+
+* BRIAN.THEME.NEW : The Brian's Theme demo with '&' placed in front of
+  the various draw calls. There isn't a huge difference in speed, as
+  there's a lot of overhead from Applesoft, but it's interesting to note
+  the change in the appearance of the lines.
+
+* WIGGLE : Sample program that shows direct use of rasterization tables.
+
+When the demos are launched from the menu, they will assume that fdraw
+is already loaded and won't try to load it again. If you run the demo
+program directly, it will try to load FDRAW.FAST and AMPERFDRAW from the
+parent directory before doing any drawing.
+
+
+## Extras ##
+
+The EXTRAS directory has some additional software that isn't "officially"
+part of fdraw, but may be of use.
+
+NOTE: some of these assume fdraw and amperfdraw are already loaded, and
+will hang if not. Run DEMO and exit from its menu before running these.
+
+* ARRAY.EXAMPLE : The &PLOT example from the documentation.
+
+* XDRAW.ANIM : A demonstration of line animation using "xdraw" mode and
+  a simple shape that is drawn twice by a single &PLOT call. One copy
+  is offset by 2 pixels, so each &PLOT call erases the previous copy and
+  draws a new copy 2 pixels to the right.
The animation is shown twice, + once with "erase all, draw all", and once with the erase and draw calls + interleaved for every line. + +* LINEFONT : Program for creating draw-array tables for text phrases. Used + to create data files for the "intro" demo. See the "LINEFONT Details" + section for more information. + +* DAVIEWER: Views the contents of .DA files created by LINEFONT. + +* BENCHCLEAR : Calls the "clear" function 256 times from a small + assembly-language program. Handy for benchmarks, but slightly silly + since it's relatively easy to calculate the exact cycle cost. + + +## LINEFONT Details ## + +NOTE: this program is an unfinished rough cut ("pre alpha"), used for +preparing data for demos. + +The program includes a font definition, routines for displaying +characters, and code for generating and exporting pre-rendered strings. + +Character vertices are expressed as floating-point values. The baseline +is at zero, the peak ascent is at 1.0, the lowest descent is -1.0. The +leftmost pixel is at zero, the maximum value for the rightmost pixel is 1.0. +Characters don't have to fill out the entire cell -- proportionally-spaced +fonts are supported -- but they are expected to start at the left edge. + +So a capital 'M' might look like this: + + 0.0,0.0 -> 0.0,1.0 -> 0.5,0.7 -> 1.0,1.0 -> 1.0,0.0 + +There is currently no "user interface", unless the "user" can program in +Applesoft BASIC. To generate strings, add a series of statements that set +variables and call 20000 to add rendered strings to the set. The relevant +variables are: + + S$ - string to add + DW - desired width, in pixels, of a cell 1.0 units wide + DH - desired height, in pixels of a cell 2.0 units high (ascent + descent) + IS% - inter-character spacing, in pixels + SW% - width of the space character (usually same as DW) + MO% - monospace flag; if nonzero, all chars are treated as 1.0 units wide + +Remove the REM from the start of line 1010 to enable the character viewer. 
+At present only a couple of lower-case letters are defined. + + +#### LINEFONT Output #### + +The LINEFONT program outputs a binary blob that can be passed to +the &PLOT array-draw function. The file structure is: + ++0 byte - number of array sets in the list. ++1 2 bytes * N - table of offsets to individual array sets. One of + these per array set. The value is the offset from the start of the + file. + +(2N+1) array set #1: ++0 byte - number of vertices (0-127) ++1 byte - number of index pairs (0-127) ++2 2 bytes * V - vertices (values are signed X/Y) ++X 2 bytes * I - index pairs (values are 0-127) + +To display phrase #3, you would get the 16-bit value from the offset +table with PEEK(start + 1 + 3 * 2) + PEEK(start + 2 + 3 * 2) * 256. +You get the number of vertices from PEEK(start + offset), and the number +of index pairs from PEEK(start + offset + 1). Finally, call the array-draw +function with: + + VA = start + offset + 2 + IA = VA + num_vertices * 2 + &PLOT va, ia, num_index_pairs + +The 0,0 point in the blob is in the center of the phrase horizontally +(which allows a maximum width of 255 pixels), and at the font baseline +vertically (so most of the font will appear above the zero point, but +descenders will extend below). + + +#### Future Enhancements #### + +Right now the font definition is embedded in the program. This takes up +a lot of space -- before too long the BASIC program is going to intrude +on the hi-res page -- and is unnecessarily restrictive. The font should be +defined by a separate program, and BSAVEd into a line-font file that +LINEFONT can load. + +Generating strings should be menu-driven and interactive, rather than +requiring manual changes to the code to fiddle with sizes and spacing. +DAVIEWER should be folded into the generation program (though it's kind +of handy as a simple example of how to unpack and access content). 
+ diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 0000000..a188b36 --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,990 @@ +fdraw Library Documentation +=========================== + +Fast graphics primitives for the Apple II +By Andy McFadden +Version 0.3, August 2015 + +## Overview ## + +The fdraw library provides fast rendering of points, lines, rectangles, +and circles, as well as high-speed screen clears, for Apple II hi-res +graphics. It can be used from Applesoft or assembly language. + +The Applesoft ROM routines were designed to be as compact as possible, +and were unable to use self-modifying code techniques, so their speed is +less than what the Apple II is capable of. The fdraw routines pick a +different point in the speed/space trade-off continuum, providing fast +speeds at a reasonable size. Not everyone agrees on what "reasonable" +means, so the fdraw code can be built in two modes, one that favors +speed, one that reduces size. + +**Contents:** + +- [Applesoft BASIC Ampersand API](#amperapi) +- [Raw API](#rawapi) +- [Building the Code](#building) +- [Apple II Hi-res in a Nutshell](#nutshell) +- [Notes on the Drawing Functions](#notes) +- [General Notes](#additional-notes) +- [Enhancement Ideas](#ideas) +- [My Quest for Lines](#history) + + +
+## Applesoft BASIC Ampersand API (Amperfdraw) ## + +The ampersand API acts as a bridge between Applesoft BASIC and fdraw. +It's more convenient and has less overhead than POKE and CALL, though +you are not prevented from using that approach if you prefer. It's +best to use one or the other though, not mix and match. + +All arguments are checked for validity. An appropriate Applesoft +error is thrown if invalid syntax or arguments are discovered. + +This is not intended to be compatible with, nor a replacement for, the +ampersand utilities in Beagle Graphics. + +* &NEW - calls the fdraw Init function (which sets the color to 0 and + selects hi-res page 1). You must do this once, at the start of + your program, after fdraw has been loaded. This also resets internal + amperfdraw state, setting the "HPLOT TO" origin to (0,0) and the "AT" + point to (139,95). +* &HGR - does what HGR does, only faster. Equivalent to executing + `&HCOLOR=0:&SCRN(1):&CLEAR:&HCOLOR=[prevcolor]`, and then setting the + display softswitches to display hi-res page 1 in mixed mode. Also sets + $e6 (HPAG) for convenience in case you want to mix & match with ROM + routines. +* &HGR2 - like &HGR, but for page 2. Like HGR2, this turns off + mixed-text mode. +* &SCRN({1,2}) - sets the hi-res page that will be used for drawing. Does + not change which page is displayed. (Use the softswitches, or call + &INVERSE.) +* &INVERSE - flips the render page to the other page, and hits the + display softswitches to show the page that was just rendered. Intended + for double-buffered animation. +* &HCOLOR={0-7} - sets color, using the same numbering scheme as Applesoft. + Does not affect the color used by the ROM routines. +* &CLEAR - clears screen to current color. +* &HPLOT [TO] x,y [TO x,y ...] - draws a point or a line. Works the same as + Applesoft, e.g. "&HPLOT TO" starts from the end of the previously + drawn line, and you can chain multiple "TO x,y" in a single statement. +* &EXP {0,1} - set line mode. 
0 is normal, 1 is "xdraw". +* &XDRAW left,top,right,bottom - draws outline rectangle. +* &DRAW left,top,right,bottom - draws filled rectangle. +* &COS cx,cy,r - draws outline circle. +* &SIN cx,cy,r - draws filled circle. + +* &AT cx,cy - sets center offset for array-based rendering. Position must + be on the hi-res screen (0-279, 0-191). +* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy] - draws from the + specified byte-arrays. See the "Drawing Lines with Indexed Byte-Arrays" + section for the full explanation. + + +
+## Raw API ## + +The code is assembled at $6000 by default. The program's length includes +all data tables and work areas, and no memory outside of the program, +zero page, and the current hi-res page is modified. + +Input parameters and the function jump table are located near the start +of the program. The API description below describes the addresses in +relative terms. + +Input parameters are not checked for validity. They must be in the range +specified by the API, or undefined (but probably bad) behavior will result. +The values will not be modified by fdraw functions. + +All drawing operations use the current color. + +* +0 Init - call this when the library is first loaded. It must be + called before any other functions are used. It initializes the + color to zero and the page to $20. +* +3 (major version number, currently 0) +* +4 (minor version number, currently 3) +* +5 Input parameter area: + * +5 arg - used for misc functions, e.g. SetColor and SetPage + * +6 x0l - low part of the X0 coordinate (0-279) + * +7 x0h - high part of X0 + * +8 y0 - Y0 coordinate (0-191) + * +9 x1l - low part of X1 (0-279) + * +10 x1h - high part of X1 + * +11 y1 - Y1 coordinate (0-191) + * +12 rad - circle radius (0-255) +* +13 (reserved) +* +16 SetColor - set the color used for drawing (0-7) to the value in "arg". + The numbering is the same as the Applesoft hi-res colors. +* +19 SetPage - set the hi-res page used for drawing to the value in "arg", + which must be $20 or $40. Does not change the page that is displayed. + (Because a bad value can cause memory corruption, this value *is* + checked, and bad values rejected.) +* +22 Clear - erase the current hi-res page to the current color. +* +25 DrawPoint - plot a single point at x0,y0. +* +28 DrawLine - draw a line from x0,y0 to x1,y1 (inclusive). +* +31 DrawRect - draw a rectangle with corners at x0,y0 and x1,y1 (inclusive). + x0,y0 is the top-left, x1,y1 is the bottom-right. 
The left and + right edges will be drawn two bits wide to ensure that the edges + are visible (drawn at x0+1, x1-1). +* +34 FillRect - draw a filled rectangle with corners at x0,y0 and x1,y1 + (inclusive). +* +37 DrawCircle - draw a circle with center at x0,y0 and radius=rad. +* +40 FillCircle - draw a filled circle with center at x0,y0 and radius=rad. +* +43 SetLineMode - set the DrawLine mode to the value in "arg", which can + be 0 (normal) or 1 (xdraw). +* +46 (reserved) + +* +49 FillRaster - draw an arbitrary shape from the rasterization tables. + For each line from top to bottom, the left and right edges will + be read from rastx1/rastx2 and a raster drawn in the current color. +* +52 (byte) topmost line to rasterize (0-191) +* +53 (byte) bottom-most line to rasterize (0-191), inclusive +* +54 (2 bytes) address of rastx1l table +* +56 (2 bytes) address of rastx1h table +* +58 (2 bytes) address of rastx2l table +* +60 (2 bytes) address of rastx2h table + +The rasterization table addresses are read-only; changing them will have +no effect. + +fdraw uses a fair number of zero page locations. The exact set can be +determined by looking at FDRAW.S. The locations were chosen to not +interfere with DOS, ProDOS, Applesoft, or the Monitor. They may +interfere with Integer BASIC, SWEET16, or your own application code. +Remapping them to different locations is straightforward: just change +the assignment of zptr/zloc values near the top of FDRAW.S to use +different addresses. fdraw does not expect any zero page value to be +preserved across calls, so you're welcome to use those locations in your +own code, but understand that fdraw functions will overwrite them. + + +
+## Apple II Hi-res in a Nutshell ## + +This is a quick overview of the Apple II hi-res graphics architecture +for anyone not recently acquainted. + +The Apple II hi-res graphics screen is a quirky beast. The typical +API treats it as 280x192 with 6 colors (black, white, green, purple, +orange, blue), though the reality is more complicated than that. + +There are two hi-res screens, occupying 8K each, at $2000 and $4000. +You turn them on and flip between them by accessing softswitches in +memory-mapped I/O space. + +Each byte determines the color of seven adjacent pixels, so it takes +(280 / 7) = 40 bytes to store each line. The lines are organized into +groups of three (120 bytes), which are interleaved across thirds of +the screen. To speed the computation used to find the start of a +line in memory, the group is padded out to 128 bytes; this means +((192 / 3) * 8) = 512 of the 8192 bytes are part of invisible +"screen holes". The interleaving is responsible for the characteristic +"venetian blind" effect when clearing the screen. + +Now imagine 280 bits in a row. If two consecutive bits are on, you +get white. If they're both off, you get black. If they alternate +on and off, you get color. The color depends on the position of the bit; +for example, if even-numbered bits are on, you get purple, while +odd-numbered bits yield green. The high bit in each byte adjusts the +position of bits within that byte by half a pixel, changing purple and +green to blue and orange. + +This arrangement has some curious consequences. If you have green and +purple next to each other, there will be a color glitch where they meet. +The reason is obvious if you look at the bit patterns when odd/even meet: +`...010101101010...` or `...101010010101...`. The first pattern has two +adjacent 1 bits (white), the latter two adjacent 0 bits (black). 
Things +get even weirder if the split occurs at a byte boundary and the high bit is +different, as the half-pixel shift can make the "glitch" pixel wider or +narrower by half a pixel. + +The Applesoft ROM routines draw lines that are 1 bit wide. If you execute +a command like `HGR : HCOLOR=1 : HPLOT 0,0 to 0,10`, you won't see +anything happen. That's because HCOLOR=1 sets the color to green, +which means it only draws on odd pixels, but the HPLOT command we gave +drew a vertical line on even pixels. It set 11 bits to zero, but since +the screen was already zeroed out there was no apparent effect. + +If you execute `HGR : HCOLOR=3 : HPLOT 1,0 to 1,10`, you would expect a +white line to appear. However, drawing in "white" just means that no +bit positions are excluded. So it drew a vertical column of pixels at +X=1, which appears as a green line. + +If (without clearing the screen after the previous command) you execute +`HCOLOR=4 : HPLOT 5,0 to 5,10`, something curious happens: the green line +turns orange. HCOLOR=4 is black with the high-bit set. So we drew a +line of black in column 5 (which we won't see, because that part of the +screen is already black), and set the high bit in that byte. The same +byte holds columns 0 through 6, so drawing in column 5 also affected +column 1. We can put it back to green with `HCOLOR=0 : HPLOT 5,0 to 5,10`. + +It's important to keep the structure in mind while drawing to avoid +surprises. + +Note that the Applesoft ROM routines treat 0,0 as the top-left corner, +with positive coordinates moving right and down, and lines are drawn +with inclusive end coordinates. This is different from many modern +systems. fdraw follows the Applesoft conventions to avoid confusion.
+ +Handy table of graphics softswitches: + +name | addr | decimal | purpose +------ | ----- | ------- | ------------------ +TXTCLR | $c050 | -16304 | enable graphics +TXTSET | $c051 | -16303 | text-only +MIXCLR | $c052 | -16302 | disable mixed mode +MIXSET | $c053 | -16301 | enable mixed mode (4 lines of text) +LOWSCR | $c054 | -16300 | display page 1 +HISCR | $c055 | -16299 | display page 2 +LORES | $c056 | -16298 | show lo-res screen +HIRES | $c057 | -16297 | show hi-res screen + + +
+## Building the Code ## + +The main fdraw code is written for the Merlin assembler (specifically +Merlin-16 3.40, though other versions should work). It uses plain 6502 +code, and is expected to run on an Apple ][+. + +For convenience when editing the files on an Apple II, and to allow the +code to be compiled by Merlin-16 running under ProDOS 8, the code is +broken into four files. The main file, FDRAW.S, includes the other +three with PUT directives. FDRAW.S holds the API entry points and some +of the drawing code. FDRAW.LINE.S has the code for drawing points and +lines, while FDRAW.CIRCLE.S has the code for drawing circles. +FDRAW.TABLE.S holds the data tables, as well as empty space for work +areas. The empty space is included in the binary so you can determine +the full memory footprint by looking at the length of the file. + +Near the top of FDRAW.S is a constant, `USE_FAST`, which may be set +to 0 or 1. If set to 0, some code optimizations are disabled, +reducing the size of the code and data areas. Further, the page +alignment on data tables is disabled, reducing the internal fragmentation +of the data area. + +The USE_FAST setting also determines which file receives the assembler +output: FDRAW.FAST or FDRAW.SMALL. To generate both, it is necessary to +assemble the file, change the constant, and then assemble the file again. + +Tests and demos are written in Applesoft BASIC, with a couple of +exceptions. + + +### Why So Big? ### + +The fdraw code weighs in at a hefty 5KB (or 4KB for the "small" build). +That doesn't sound like much in the age of multi-gigabyte mobile phones, +but it's a sizeable fraction of the space available on an Apple ][+. + +If you want to modify individual pixels quickly, you need two things: +a line base-address table, and a divide-by-7 table. Computing base +addresses and dividing by 7 aren't hugely expensive, but we're going +to be doing them often, so they need to be as fast as possible.
+ +The line address table has 192 entries, one for each line, 2 bytes per +entry. The divide-by-7 table has 280 entries, one for each horizontal +pixel position, with one byte for the quotient and one for the remainder. +(The remainder can be expressed as a numeric value from 0 to 6, or as +a byte with a specific bit set.) + +That's 944 bytes. For optimum performance, each table must fit on a +single page of memory. We can split the division table into two pieces, +one for 0-255 and one for 256-279, and put the smaller half on the same +page as the Y table, along with 16 bytes of padding. The final size is +256 + 256 + (192+24+24+pad) + 192 = 960. So you can write off 1K of +memory before you've written any code. + +(There's a clever way to reduce the size of the y-lookup table to 24 +entries, but it's slightly faster and much easier to use full tables.) + +For the FillRaster function, fdraw needs to record the left and right +X coordinates on each line (2 bytes each), so that's 192 * 4 = 768 bytes. +Again, for optimum performance, each table needs to be on its own page, +so for USE_FAST=1 that expands to 1024 bytes. + +Add to that another full page of unrolled rasterization code, and you've +got 2304 bytes of tables. + +The rest is code, most of which was written with a flagrant disregard +for size. Many common code fragments are repeated inline, rather than +called as a subroutine, because a subroutine call (JSR+RTS) costs 12 +cycles. Calling a common "plot a point" function from the line-drawing +code would increase the per-pixel cost by 15-20%.
+## Notes on the Drawing Functions ## + +### Screen Clear ### + +The Clear function erases the current hi-res page to the current color. +It's several times faster than the version built into the ROM. + +#### Performance #### + +The fastest possible way to clear the screen to a specific color on a +6502 is to write to every visible location with an absolute store +instruction. Subtracting the screen holes, that's 7680 addresses * +4 cycles = 30720 cycles. The code to do that would be 23,040 bytes long, +making it impractical. + +A slower but more memory-efficient approach has one store statement for +each line, and iterates through 40 times (280 / 7 = 40). Factoring in the +loop overhead, that comes out to 40 * (192 * 5 + 9) = 38760 cycles. +192 sets of store instructions fill 576 bytes, which is much better +than 23K, but still quite a lot. + +We can reduce the size further by taking the lines 3 at a time, erasing +the first 120 bytes in each 128-byte group (the last 8 bytes are the +screen hole). We'd need to use 7680/120 = 64 store instructions, for a +total of 120 * (64 * 5 + 9) = 39480 cycles, with 192 bytes for the main +part of the erase loop. We're not quite 2% slower, but 384 bytes +smaller, which seems a fair trade-off. Because we're accessing memory +linearly we now have a "venetian blind" clear, which is something of an +Apple II trademark, but we can fix that by spending an additional 522 +cycles to erase the screen in thirds (top/middle/bottom). + +Any further changes that make the code smaller also increase the execution +time. When built with USE_FAST=0, the code will use a different loop +with 32 stores that write 248 bytes each, and takes 41416 cycles. It's +half the size, but nearly 2000 cycles slower, and overwrites half of the +screen holes.
+ +At the extreme end of space over speed is the Applesoft ROM routine -- HGR +or "CALL 62454" -- which only needs about 30 bytes for its main loop, but +takes (8192*33)+(12*64)+17 = 271121 cycles for black or white, or +(8192*40)+(12*64)+17 = 328465 cycles for green/purple/blue/orange -- +7-8x slower than our preferred implementation. + +The screen clear is wired to a specific hi-res page, so the SetPage +function must rewrite the store instructions when the page changes (or +we need to keep two full copies of the function around). For an +application that is constantly doing flip-erase, the overhead must be +factored into the efficiency of the approach -- for example, rewriting +stores with indexed LDA/EOR/STA in a loop will take 20 cycles per iteration, +1280 cycles for the full set of 64. The "slow" clear has half the +number of store instructions, so takes half the time to fix up after +a page flip. + + +### Raster Fill ### + +Drawing an outline of a rectangle or circle can be done efficiently by +drawing lines or plotting points. Drawing a filled shape is more +expensive if one point is plotted at a time, especially on the Apple II +where every byte affects 7 pixels. + +For filled shapes, fdraw populates a rasterization table. The table has +192 entries, each of which holds the left and right edges of the shape +on that line. The code fills in the pixels one line at a time, using +a simple byte store for the middle parts, and bit masks at the edges. + +External applications can use the raster renderer directly by filling +out the rasterization table and calling FillRaster. + +While the FillRaster function itself will not modify the contents of the +raster tables, other fdraw calls will, sometimes unexpectedly. For +example, drawing a horizontal line is performed with a single-line +fill call. Filled rectangles might populate the table in the way you'd +expect, or might use some internal shortcut that only fills out one line +and sets a "repeat" flag. 
Don't make assumptions about what will be in +the table after a call to one of the drawing functions. You *can* count +on whatever you wrote there yourself to be unmodified after calls to +FillRaster, SetColor, or SetPage, so you can do page-flipping and +color-cycling without having to repopulate the tables. + +#### Performance #### + +The fill code needs about 100 cycles to set up each line when drawing +a rectangle, more if the line doesn't start and end on byte boundaries. +The inner loop costs 10 cycles per byte. To clear the screen with the +raster fill code, it would take (192 * (100 + 40 * 10)) = 96000 cycles, +or nearly 2.5x the time required for the dedicated clear code. Which is +about what you'd expect, as the screen erase needs 4 cycles per byte, and +has lower per-line overhead. (This can be improved significantly; see +the notes in the "enhancements" section.) + +Non-rectangular shapes take slightly longer to set up, as the edges must +be recomputed for each line. + + +### Lines ### + +The goal is to provide a replacement for Applesoft's HPLOT function +that is faster and more consistent in appearance. Lines are drawn using +Bresenham's run-length algorithm. + +Internally, there are five separate functions. Horizontal and vertical +lines each get a special-case handler. There's another for mostly-vertical +lines, one for mostly-horizontal lines, and one for wide mostly-horizontal +lines (255 pixels or wider). The latter requires 16-bit math, and is +slightly slower. + +The Applesoft routine isn't quite the same as the standard Bresenham +algorithm, because it doesn't move diagonally. Consider a line from +(0,0) to (50,10) -- gently sloping down and to the right. The standard +algorithm would plot exactly 51 pixels, one in each horizontal position. +The "pen" always moves one pixel right, but sometimes also moves down. + +In Applesoft, the "pen" can move either right or down, but can't do +both at once. 
This results in lines that feel thin when near horizontal +or vertical, but become thicker as they approach 45 degrees. This +reduces performance, because Applesoft draws twice as many pixels for a +diagonal line as the basic algorithm. It can also be visually jarring +when animated, because lines get very thick when near diagonal. + +Different applications have used different styles; for example: + +- Stellar 7 and Elite for the Apple II use Bresenham-style lines. If + you look at near-diagonal lines on a color monitor you can see the + pixels alternating green and purple. +- A2-FS1 Flight Simulator appears to be using Bresenham lines but with + doubled bits, effectively treating the screen as having 140 pixels. This + gives solid white lines with a fairly consistent feel. +- GraFORTH doubles the bits, but treats the screen as 256 pixels wide + (not 280... it gives up 24 pixels to improve performance). White + lines are thick like Flight Simulator, but feel less jagged because + each step can move left or right by one bit rather than two. + +The SetLineMode function lets you choose between "draw" and "xdraw". The +former draws color pixels, setting and clearing bits as needed, while +the latter inverts whatever is currently on the screen. This can have +some unusual effects. Drawing the same line twice erases the line. +Drawing a green line over a purple line gives you a white line. Drawing +with colors 5 and 6 can produce odd results, because the high bit inverts +every time you touch a byte -- which means the ends of a horizontal line +will be a different color if the byte holds an even number of affected +pixels. It's best to draw with colors 0-3 when in xdraw mode. Clearing +the background to color 4, rather than 0, will cause drawing in colors +0-3 to actually be 4-7. + +#### Performance #### + +Mostly-horizontal lines step horizontally each iteration, and sometimes +step vertically. 
Mostly-vertical lines step vertically each iteration, +and sometimes step horizontally. Each part of the operation has a cost, +so the fastest lines are the ones drawn primarily in a single direction. +Diagonal lines are the worst case for performance. + +The current code requires just under 80 cycles per pixel for diagonal +movement, and about 56 for single-direction movement. There's another +150 cycles or so per line for the initial setup. + +Vertical lines cost about 43 cycles per pixel. Horizontal lines are +handled as a trivial FillRaster call, which at peak performance can write +7 pixels in 10 cycles. + +This is about as fast as you can get with the Bresenham run-length +algorithm and Applesoft-style color handling. It's possible to go faster +by switching to a different pixel style, or using a run-slice approach. + + +### Rectangles ### + +Filled rectangles are currently implemented by putting the left and +right edges into the rasterization table, and calling FillRaster. + +Outline rectangles could be drawn as four lines, but that doesn't look +very good in color unless you get the lines on the right columns. To +ensure that the edges are in the correct color, outline rectangles are +drawn as four separate items: a two-pixel-wide left edge, a two-pixel-wide +right edge, and horizontal lines at the top and bottom. FillRaster does +the actual work. + +#### Performance #### + +FillRaster is suboptimal for rectangles, because it works by rows rather +than by columns (see "Vertically-Challenged Rasterization" later in this +document). Rectangles could be drawn 2.5x faster with dedicated code, +but at a cost of hundreds of bytes of memory. + +The advantage of using FillRaster is that we need it for filled circles, +so adding support for rectangles was nearly free. And it's still pretty +fast. + + +### Circles ### + +Circles are computed with Bresenham's algorithm. 
The idea is to compute +one octant of the circle with this bit of magic: + +void drawOutline(int cx, int cy, int rad) { + int x, y, d; + + d = 1 - rad; + x = 0; + y = rad; + + while (x <= y) { + plot(cx, cy, x, y); + + if (d < 0) { + d = d + (x * 4) + 3; + } else { + d = d + ((x - y) * 4) + 5; + y--; + } + x++; + } +} + +Then each X/Y coordinate is plotted eight times: + + (cx+x, cy+y) (cx-x, cy+y) (cx+x, cy-y) (cx-x, cy-y) + (cx+y, cy+x) (cx-y, cy+x) (cx+y, cy-x) (cx-y, cy-x) + +For an outline circle, we plot every point. For a filled circle, we add +each point to a rasterization table. Near the top and bottom of the +circle there will be multiple updates to the same line, with each update +replacing the previous one (which works, as we are moving "outward"). + +The center point of the circle must be on screen, but it's not necessary +for the entire circle to fit. Coordinates outside screen space are clipped. + +#### Performance #### + +The implementation of Bresenham's algorithm is straightforward, and is +about as fast as it's going to get. There are actually two versions of +the core computation. If the radius is less than 41, we can keep all of +the variables in 8 bits. For circles with radius 41 and larger, we need +to use 16 bits, slowing each step slightly. + +There are also two versions of the octant plot. If the circle fits entirely +on-screen, we use a simple version. If it doesn't, we use a version that +clips values. For rasterization that means clamping X to the left or +right edge, and skipping updates that are off the screen in the Y dimension. +For an outline circle we simply don't plot any clipped points. + +The rendering of filled circles is very fast, though there is a possibility +of optimizing the center-fill of large circles. Outline circles were +added by inserting JSR PLOT at key points, and could perhaps be faster. 
+ + +### Drawing Lines with Indexed Byte-Arrays ### + +The &PLOT command allows a BASIC program to execute a series of line-draw +commands with a single statement. Think of it like shape-table animation +with lines instead of plotted points. + +Suppose you want to draw a rectangle with an X through the middle. We'll +make it 11 units wide and 21 units high. To draw that in the middle of +the screen, we'd set CX=139 and CY=95, then draw lines offset from that +by +/- 5 in X and +/- 10 in Y: + + HPLOT CX-5,CY-10 TO CX-5,CY+10 : REM LEFT + HPLOT CX-5,CY-10 TO CX+5,CY-10 : REM TOP + HPLOT CX+5,CY-10 TO CX+5,CY+10 : REM RIGHT + HPLOT CX-5,CY+10 TO CX+5,CY+10 : REM BOTTOM + HPLOT CX-5,CY-10 to CX+5,CY+10 : REM SLASH + HPLOT CX+5,CY-10 to CX-5,CY+10 : REM BACKSLASH + +Six lines, each of which needs four coordinates. We'd need 24 bytes +to store that in an integer array. + +Suppose instead we identified the four vertices, and numbered them: + + #0 CX-5,CY-10 + #1 CX+5,CY-10 + #2 CX-5,CY+10 + #3 CX+5,CY+10 + +and then created a list of line segments using the vertex indices: + + HPLOT #0 TO #2 + HPLOT #0 to #1 + HPLOT #1 TO #3 + HPLOT #2 TO #3 + HPLOT #0 TO #3 + HPLOT #1 TO #2 + +This requires (4*2) + (6*2) = 20 bytes, for a small savings. The real +value in the approach is that it separates the description of the shape +from the placement of the points. For example, if you want to change +vertex #0 to (CX-7,CY-12), you don't have to make changes to three +separate HPLOT calls. (This is particularly useful when you have code +that scales and rotates the vertices.) + +For the current release of fdraw, the only built-in transform is +translation. Using "&AT cx,cy", you can place the center point anywhere +on the screen. This allows you to animate movement of the shape by +simply calling &AT to change the position, and &PLOT to draw. + +The &PLOT command takes three arguments: the address of a vertex array, +the address of an index array, and the number of line segments to draw.
+These are referred to as "byte arrays" because they are arbitrary +locations in memory where you have BLOADed or POKEd your shape data, not +Applesoft arrays. The count can be from 0 to 127. You can optionally +add an AT to the end; if not present, the coordinates of the previous AT +are used. The initial value is the center of the screen (x=139 y=95). + +The vertex array uses two signed bytes per vertex (-128 to 127), one for +the X coordinate and one for the Y coordinate. + +The index array uses two bytes per line segment. Each byte is an index +into the vertex array, from 0 to 127. + +Here's an Applesoft program that implements the above example. (The DATA +statements use negative numbers for clarity; if you replace the negative +values with 256+value, e.g. -5 becomes 251, then you can avoid the IF +statement and just poke the value directly.) + + 100 TEXT : NORMAL : HOME + 200 & NEW : & HGR : VTAB 21 + 210 & HCOLOR= 3 + 500 REM ARRAY TEST + 510 AD = 768: REM $300 + 520 READ D: IF D = 1000 THEN 560 + 530 IF D < 0 THEN D = 256 + D + 540 POKE AD,D:AD = AD + 1: GOTO 520 + 560 & PLOT 768,776,6: & AT 50,50: & PLOT 768,776,6 + 570 POKE 768,256 - 10: POKE 769,256 - 20: & PLOT 768,776,6 AT 100,50 + 600 DATA -5,-10, 5,-10, -5,10, 5,10 + 610 DATA 0,2, 0,1, 1,3, 2,3, 0,3, 1,2, 1000 + +This draws the shape twice, once at the middle of the screen, once centered +at 50,50. It then adjusts the top-left coordinate, and draws the shape +centered at 100,50. Looking at the output, you can see that the top-left +corner of the third instance has moved, and all three lines from that +point have moved with it. + +If a vertex ends up off-screen, lines that use that vertex are omitted +(not clipped). If you tried to draw the example shape at (0,0), nothing +would happen, because every line has at least one point that would be +off-screen -- only point #3 is still visible, and all of the lines that +use that point extend off screen. 
+ +You can specify a maximum of 128 vertices and 128 index pairs for a +single call. If none of the line segments share vertices, you'll need +two vertices per line, which means a cap of 64 lines. + +#### Performance #### + +There isn't a whole lot to it -- it just feeds the lines to DrawLine. +The key speed advantage is the removal of the Applesoft overhead. + + +
+## Enhancement Ideas ## + +Some ideas for future versions of fdraw. + +### fdraw ### + +Line clipping would make the array-draw function more useful for +animation projects. If we accepted signed 16-bit values as input to +the clip function, we could specify an AT point outside the screen bounds. +That could be extended to circles, which could have off-screen centers. + +A "game line" function or line mode that restricts coordinates to 0-255 +and ignores color might be worth an experiment. + +Triangle rasterization is possible, but perhaps a bit silly. + +We could handle ellipses, but they're more complicated than circles, and +are slower to compute -- you need a couple of multiplications during +setup, and the asymmetry means you have to compute a quadrant rather +than an octant. If the goal is fast animation rather than general-purpose +picture painting then there's little value in supporting ellipses. + +Some of the inner loops are almost certainly paying an extra cycle to +cross a page boundary. That's not easy to fix without adding absurd +amounts of padding. + +"USE_FAST" could be applied more aggressively to reduce the size. + +Having "fast" vs. "small" builds was mostly an experiment to see how +much of a difference in size and speed we'd get by dropping some of +the more expensive operations. Another way to reduce size would be to +make the build modular, so you could (say) omit circle drawing or only +include line drawing. Some trade-offs would have to be made, e.g. if +you only wanted line drawing then it makes sense to disable (or replace) +the horizontal-line optimization that calls FillRaster, as that requires +some sizeable tables that would otherwise be unused. + +### Amperfdraw ### + +The Amperfdraw API is somewhat minimal and could be improved. 
Taking a +cue from Beagle Graphics, the rect and circle calls should probably look +more like: + + &DRAW width,height [AT left,top] + &COS radius [AT left,top] + +The "&AT" coordinate, currently only used by &PLOT, should be more +widely used. Not only is it more convenient, it's also slightly faster, +since we don't have to parse the left/top coordinates each time. + +The existing code is (somewhat lazily) using the Applesoft routines to +parse coordinates, which includes the range check. We wouldn't be able +to use them for width/height, because we would need to take values in the +range (0-280, 0-192), where width/height of zero means "draw nothing". + +I deliberately used Applesoft tokens, rather than arbitrary words, to +make commands simpler to parse. Some of them don't fit that well. COS +and SIN are circle-related, but it's not obvious which is outline and +which is filled. DRAW and XDRAW don't really sound like rectangle-draw +calls, and would be much more appropriate if used to set the line draw +mode. Spending a few bytes & cycles to get better names might be +worthwhile. + +It's possible to store &PLOT arrays in actual BASIC integer arrays, +which might make them easier to code for. The fact that arrays are +DIM()ed once, cannot be resized, and cannot be discarded makes them +difficult to use for dynamic data. + +Currently &PLOT takes a list of vertices and a list of line segments. +We could also support "continuous line" mode, where it just plays +connect-the-dots (saves space, doesn't really affect speed). Being +able to embed color changes could be handy. + +&PLOT handles lines and vertices the way Applesoft does, with inclusive +coordinates. This results in overdraw when vertices are shared. This +is a (small) performance hit, and causes graphical glitches when connected +lines are drawn in "xdraw" mode. + + +
+# Additional Notes # + +Getting into the gory details here. + +## Setting a pixel ## + +Hi-res pixels are curious creatures. + +Pixel color values are determined by adjacent bits. The various drawing +routines only set one bit at a time, so "drawing" in green (hcolor=1) will +cause bits to be set in odd columns, cleared in even columns. We don't +touch adjacent bits, so drawing purple (hcolor=2) in column 0 and green +in column 1 will produce a white line, while drawing them with the columns +reversed will produce a black line. + +Making life more complicated is the use of the high bit in each byte, which +affects the color. If you draw a purple line in column 0, and a black1 +line with hcolor=4 in column 6, the purple line turns blue, because the +black1 line sets the high bit. + +To set a bit at an arbitrary X offset, we need to do the following: + +(1) Determine which byte to change (xc / 7) and which bit (xc mod 7). +(2) Determine the color mask for that byte. For green, it's 0x2a + (00101010) in even columns, 0x55 (01010101) in odd columns. +(3) Set or clear the target bit and the high bit, leaving the others + intact. + +One way to do this is illustrated below. Assume we're drawing a green +line at X=17. There's already a green dot at X=15, which gives us a +bit pattern of 00000010. (Bits are "backwards", i.e. the bit on the +right is the pixel on the left.) + + LDY byteoffset X=2 + LDX bitoffset X=3 + LDA bitmask,x A=0x88 (10001000) + STA