Initial checkin

fdraw v0.3
2015-08-21 14:15:18 -07:00 · 2015-08-21 14:15:18 -07:00 · 418e7b7191
parent 2ca9d4084f
commit 418e7b7191
10 changed files with 4446 additions and 2 deletions
--- a/AMPERFDRAW.S
+++ b/AMPERFDRAW.S
@ -0,0 +1,549 @@
+********************************
+*                              *
+* Amper-fdraw                  *
+* By Andy McFadden             *
+* For fdraw version 0.3        *
+*                              *
+* Applesoft ampersand          *
+* interface for fdraw.         *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+         lst   off
+         org   $1d60
+
+* All of the handler entry points can fit on a single
+* page, so it's possible to save a few bytes by
+* dropping the high jump table and just hardcoding
+* the first page into the jump.  This requires that
+* the ORG be at $xx00.  The ORG above is not page-
+* aligned, so the high jump table is kept.
+
+         PUT   FDRAW.DEFS
+
+* Applesoft BASIC tokens.
+tok_plot equ   $8d
+tok_hgr2 equ   $90
+tok_hgr  equ   $91
+tok_hcolor equ $92
+tok_hplot equ  $93
+tok_draw equ   $94
+tok_xdraw equ  $95
+tok_inverse equ $9e
+tok_clear equ  $bd
+tok_new  equ   $bf
+tok_to   equ   $c1
+tok_at   equ   $c5
+*tok_sgn equ $d2
+tok_scrn equ   $d7
+tok_exp  equ   $dd
+tok_cos  equ   $de
+tok_sin  equ   $df
+
+* System locations.
+*
+* PCL/PCH and A1L/A1H are borrowed as scratch by
+* array_draw; LINNUM receives the result of GETADR.
+PCL      equ   $3a        ;used by monitor
+PCH      equ   $3b        ;used by monitor
+A1L      equ   $3c        ;used by monitor
+A1H      equ   $3d        ;used by monitor
+LINNUM   equ   $50        ;50-51
+FACLO    equ   $a1        ;see GETBYT
+CHRGET   equ   $b1        ;advance ptr, get next tok
+CHRGOT   equ   $b7        ;get next tok (no advance)
+TXTPTR   equ   $b8
+HPAG     equ   $e6        ;$20 or $40
+
+AMPERV   equ   $3f5       ;JMP opcode + address (see init)
+
+* Display soft switches.  hgr_com and h_inverse touch
+* these with BIT / LDY; the value read is ignored.
+TXTCLR   equ   $c050
+TXTSET   equ   $c051
+MIXCLR   equ   $c052      ;no text (used by &HGR2)
+MIXSET   equ   $c053      ;mixed text (used by &HGR)
+LOWSCR   equ   $c054      ;show page 1
+HISCR    equ   $c055      ;show page 2
+LORES    equ   $c056
+HIRES    equ   $c057
+
+* Applesoft entry points.
+ERROR    equ   $d412      ;error based on X reg
+FRMNUM   equ   $dd67      ;evaluate expr (see array_draw)
+SynError equ   $dec9      ;throw SYNTAX ERROR
+CHKCOM   equ   $debe      ;eat a comma
+IllQError equ  $e199      ;throw ILLEGAL QUANTITY ERROR
+GETADR   equ   $e752      ;result left in LINNUM
+GETBYT   equ   $e6f8      ;gets byte, in X/FACLO
+HFNS     equ   $f6b9      ;get hi-res x/y for hplot
+* Prepare the ampersand vector.
+*
+* Ideally we'd check to see if the existing vector is
+* different from ours, and if so, jump to it when we
+* get a token we don't recognize.  Not convinced
+* there's an actual use case for this.
+init
+* Point the ampersand vector at dispatch.  The three
+* bytes are independent, so the operand low byte is
+* written first, then the opcode, then the high byte.
+         lda   #<dispatch
+         sta   AMPERV+1
+         lda   #$4c       ;JMP opcode; rewritten in
+         sta   AMPERV     ; case it was overwritten
+         lda   #>dispatch
+         sta   AMPERV+2
+         rts
+
+* Entry point from BASIC.  The token is in A.
+*
+* Scans :cmdtab from the last entry down to the first
+* looking for the token.  On a match, the handler
+* address minus one is pushed (high byte first) and
+* we JMP to CHRGET, so that CHRGET's return lands on
+* the handler after the token has been eaten.
+* Unrecognized tokens throw SYNTAX ERROR.
+*
+* :cmdtab, :jmptabl and :jmptabh are parallel tables;
+* entry N of each must refer to the same handler.
+dispatch
+         ldx   #:cmdend-:cmdtab-1 ;index of last entry
+]loop    cmp   :cmdtab,x
+         beq   :match
+         dex
+         bpl   ]loop
+         jmp   SynError   ;token not in table
+
+:match
+         lda   :jmptabh,x ;push handler-1, hi first
+* lda #>h_new ;all on first page
+         pha
+         lda   :jmptabl,x
+         pha
+         jmp   CHRGET     ;eat token, jump
+
+
+* Tokens we respond to.
+:cmdtab  dfb   tok_new
+         dfb   tok_hgr
+         dfb   tok_hgr2
+         dfb   tok_scrn
+         dfb   tok_hcolor
+         dfb   tok_inverse
+         dfb   tok_clear
+         dfb   tok_hplot
+         dfb   tok_xdraw
+         dfb   tok_draw
+         dfb   tok_exp
+         dfb   tok_cos
+         dfb   tok_sin
+         dfb   tok_at
+         dfb   tok_plot
+:cmdend
+
+* Low bytes of (handler address - 1).
+:jmptabl dfb   <h_new-1
+         dfb   <h_hgr-1
+         dfb   <h_hgr2-1
+         dfb   <h_scrn-1
+         dfb   <h_hcolor-1
+         dfb   <h_inverse-1
+         dfb   <h_clear-1
+         dfb   <h_hplot-1
+         dfb   <h_xdraw-1
+         dfb   <h_draw-1
+         dfb   <h_exp-1
+         dfb   <h_cos-1
+         dfb   <h_sin-1
+         dfb   <h_at-1
+         dfb   <h_plot-1
+* High bytes of (handler address - 1).
+:jmptabh dfb   >h_new-1
+         dfb   >h_hgr-1
+         dfb   >h_hgr2-1
+         dfb   >h_scrn-1
+         dfb   >h_hcolor-1
+         dfb   >h_inverse-1
+         dfb   >h_clear-1
+         dfb   >h_hplot-1
+         dfb   >h_xdraw-1
+         dfb   >h_draw-1
+         dfb   >h_exp-1
+         dfb   >h_cos-1
+         dfb   >h_sin-1
+         dfb   >h_at-1
+         dfb   >h_plot-1
+
+
+********************************
+* &NEW - initialize
+h_new
+* Reset our own state, then let fdraw initialize.
+         lda   #$00
+         sta   g_hcolor   ;color 0
+         tax              ;"previous hplot" point
+         tay              ; becomes (0,0)
+         jsr   storeprv
+         lda   #$20       ;match Init result
+         sta   g_cur_page
+         ldx   #139       ;array center X = 279/2
+         ldy   #0
+         lda   #95        ;array center Y = 191/2
+         jsr   storeac
+         jmp   f_Init
+
+********************************
+* &HGR - show page 1 with mixed text, and clear screen.
+* The screen is cleared with color zero; the current
+* color (g_hcolor) is restored afterward.
+h_hgr
+         ldx   #$20       ;page 1
+         lda   #$00       ;$c054
+         beq   hgr_com    ;always (A is zero)
+
+********************************
+* &HGR2 - show page 2 with no text, and clear screen.
+* The screen is cleared with color zero; the current
+* color (g_hcolor) is restored afterward.
+h_hgr2
+         ldx   #$40       ;page 2
+         lda   #$01       ;$c055
+                          ;fall through to hgr_com
+
+* Common code.  On entry X holds the page ($20/$40)
+* and A is 0 for page 1, nonzero for page 2.
+*
+* We go slightly out of our way to clear the screen
+* before tripping the softswitches.  This avoids
+* flashing the previous hi-res page contents when
+* entering from text mode.
+*
+* We also want to go nomix-page2 but page1-mix
+* (note reverse order) to avoid flashing text pg 2.
+hgr_com  stx   f_in_arg
+         stx   g_cur_page
+         stx   HPAG       ;probably useful
+         pha              ;save page selector
+         jsr   f_SetPage
+         lda   #$00       ;clear with color zero
+         sta   f_in_arg
+         jsr   f_SetColor
+         jsr   f_Clear
+         lda   g_hcolor   ;restore color
+         sta   f_in_arg
+         jsr   f_SetColor
+         bit   TXTCLR     ;$c050
+         bit   HIRES      ;$c057
+         pla              ;0 = page 1
+         beq   :pg1
+         bit   MIXCLR     ;$c052
+         bit   HISCR      ;$c055
+         rts
+:pg1     bit   LOWSCR     ;$c054
+         bit   MIXSET     ;$c053
+         rts
+
+********************************
+* &SCRN({1,2}) - set the current hi-res page
+*
+* The argument must be 1 or 2 (else ILLEGAL QUANTITY)
+* and must be followed by ')' (else SYNTAX ERROR).
+* The closing paren is verified before it is eaten so
+* that a missing ')' can't cause us to swallow the
+* end-of-statement marker.
+h_scrn
+         jsr   GETBYT     ;page number in X
+         cpx   #1
+         beq   :okay
+         cpx   #2
+         beq   :okay
+         jmp   IllQError
+:okay    jsr   CHRGOT     ;peek at char after expr
+         cmp   #$29       ;is it ')'?
+         beq   :eat
+         jmp   SynError
+:eat     jsr   CHRGET     ;eat ')'
+         txa              ;X/Y unaltered
+         asl
+         asl
+         asl
+         asl
+         asl              ;multiply x32 -> $20/$40
+         sta   g_cur_page
+         sta   f_in_arg
+         jmp   f_SetPage
+
+********************************
+* &HCOLOR={0-7} - set the current color
+h_hcolor
+         jsr   GETBYT     ;color arrives in X
+         cpx   #8
+         bge   :toobig    ;only 0-7 accepted
+         stx   g_hcolor   ;remembered for &HGR/&HGR2
+         stx   f_in_arg
+         jmp   f_SetColor
+:toobig  jmp   IllQError
+
+********************************
+* &INVERSE - flip pages
+*
+* If we're currently drawing on $20, we set the page
+* to $40 and hit $c054 to show $20.  And vice-versa.
+* The goal is to make double-buffered animation easy.
+h_inverse
+         lda   g_cur_page
+         eor   #$60       ;$20 <-> $40
+         sta   f_in_arg   ;new draw page for fdraw
+         sta   g_cur_page
+         ldx   #$01       ;assume we show page 2
+         cmp   #$40       ;now drawing on page 2?
+         bne   :flip      ;no, X=1 selects HISCR
+         dex              ;yes, X=0 selects LOWSCR
+:flip    ldy   LOWSCR,x   ;hit $c054 or $c055
+         jmp   f_SetPage
+
+********************************
+* &CLEAR - clear current page to current color.
+h_clear
+* No arguments to parse; fdraw does all the work.
+         jmp   f_Clear    ;well, that was easy
+
+********************************
+* &XDRAW left,top,right,bottom - draw rectangle outline
+h_xdraw
+         jsr   getltrb    ;corners -> x0/y0 and x1/y1
+         jmp   f_DrawRect
+
+********************************
+* &DRAW left,top,right,bottom - draw filled rectangle
+h_draw
+         jsr   getltrb    ;corners -> x0/y0 and x1/y1
+         jmp   f_FillRect
+
+********************************
+* &EXP {0,1} - set line draw mode
+h_exp
+         jsr   GETBYT     ;mode arrives in X
+         cpx   #2
+         bge   :badmode   ;only 0 and 1 accepted
+         stx   f_in_arg
+         jmp   f_SetLineMode
+:badmode jmp   IllQError
+
+********************************
+* &COS cx,cy,rad - draw outline circle
+h_cos
+         jsr   getcxcyr   ;center -> x0/y0, radius
+         jmp   f_DrawCircle
+
+********************************
+* &SIN cx,cy,rad - draw filled circle
+h_sin
+         jsr   getcxcyr   ;center -> x0/y0, radius
+         jmp   f_FillCircle
+
+********************************
+* &AT x,y - select center for array draw
+h_at
+* Also called by array_draw for its optional AT.
+         jsr   HFNS       ;coords come back in X/Y/A
+         jmp   storeac    ;save as array center
+
+********************************
+* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy]
+*  draw lines from arrays of vertices and indices
+h_plot   jmp   array_draw ;parsing + drawing live there
+
+********************************
+* &HPLOT x,y - draw a point
+* &HPLOT TO x,y - draw a line from last point to x,y
+* &HPLOT x0,y0 to x1,y1 - draw a line
+         lst   on         ;last token handler --
+h_hplot  equ   *          ; must be on first page
+         lst   off        ; to omit high byte table
+
+* Three forms are handled.  A leading TO starts from
+* the saved previous point; otherwise the first
+* coordinate is parsed and is either plotted alone or
+* used as the start of a chain of TO segments.  The
+* last point used is saved for a later "HPLOT TO".
+         jsr   CHRGOT     ;check next token
+         lst   off
+         cmp   #tok_to    ;is this an "HPLOT TO"?
+         beq   :leadingto
+         jsr   getx1y1    ;get the first coord
+         jsr   copy1to0
+         jsr   CHRGOT     ;see if single point
+         cmp   #tok_to
+         beq   :hplot_to  ;nope, draw line
+         jsr   copy0toprev ;draw point, and save x/y
+         jmp   f_DrawPoint ; for subsequent HPLOT TO
+
+:leadingto                ;"HPLOT TO", restore the
+         lda   g_prevxl   ; previous coord to x0/y0
+         sta   f_in_x0l   ;(can't rely on f_in_zzz
+         lda   g_prevxh   ; being there -- we might
+         sta   f_in_x0h   ; have drawn a rect)
+         lda   g_prevy
+         sta   f_in_y0
+* Loop: one pass per "TO x,y" segment.
+:hplot_to
+         jsr   CHRGET     ;eat the TO
+         jsr   getx1y1    ;get the coords
+         jsr   f_DrawLine ;draw it
+         jsr   copy1to0   ;shift 1->0 for next round
+         jsr   CHRGOT
+         cmp   #tok_to    ;another TO?
+         beq   :hplot_to  ;yes, branch
+         jmp   copy0toprev ;no, save prev and bail
+
+* Get coordinates and store in X1/Y1.
+getx1y1
+         jsr   HFNS       ;parse "x,y" into X/Y/A
+* store1: alternate entry, coords already in X/Y/A
+* (X = x lo, Y = x hi, A = y).
+store1   sta   f_in_y1
+         sty   f_in_x1h
+         stx   f_in_x1l
+         rts
+
+* Save x0/y0 as our "previous" coordinate.
+copy0toprev
+         ldy   f_in_x0h
+         ldx   f_in_x0l
+         lda   f_in_y0
+* storeprv: alternate entry, coords already in X/Y/A.
+storeprv sta   g_prevy
+         sty   g_prevxh
+         stx   g_prevxl
+         rts
+
+* Copy X1/Y1 into X0/Y0.
+copy1to0
+         ldy   f_in_x1h
+         ldx   f_in_x1l
+         lda   f_in_y1
+* store0: alternate entry, coords already in X/Y/A.
+store0   sta   f_in_y0
+         sty   f_in_x0h
+         stx   f_in_x0l
+         rts
+
+* Store X/Y/A into array-center.
+storeac  sta   g_ac_y     ;A = center y
+         sty   g_ac_xh    ;Y = center x, high byte
+         stx   g_ac_xl    ;X = center x, low byte
+         rts
+
+* Get left/top/right/bottom coordinates.
+getltrb
+         jsr   HFNS       ;left,top
+         jsr   store0     ; -> X0/Y0
+         jsr   CHKCOM     ;comma is required
+         jsr   HFNS       ;right,bottom
+         jmp   store1     ; -> X1/Y1, and return
+
+* Get center coordinates and radius.
+getcxcyr
+* Parses "cx,cy,rad".  The center goes to X0/Y0 and
+* the radius, a single byte, goes to f_in_rad.
+         jsr   HFNS       ;get CX and CY
+         jsr   store0     ;save as X0/Y0
+         jsr   CHKCOM     ;eat a comma
+         jsr   GETBYT     ;convert to 0-255
+         stx   f_in_rad
+         rts
+
+* Array-draw handler.
+*
+* Syntax: &PLOT vertexAddr,indexAddr,count [AT cx,cy]
+*
+* "count" is the number of lines (0-127).  Each line
+* uses two consecutive entries from the index buffer,
+* and each index selects a vertex (see getvertex).
+* A line is skipped if either endpoint is invalid.
+*
+* The optional AT clause is parsed even when count is
+* zero, so that BASIC never finds a leftover
+* "AT cx,cy" after we return.
+*
+* We know that fdraw doesn't use LINNUM or A1L/A1H,
+* so it's safe to use them here.  PCL/PCH are used as
+* scratch in the same way.
+array_draw
+]vertices equ  A1L        ;2b
+]indices equ   LINNUM     ;2b
+]count   equ   PCL        ;number of indices (lines*2)
+]cur     equ   PCH        ;next index to fetch
+
+         jsr   FRMNUM     ;get vertex buffer address
+         jsr   GETADR
+         lda   LINNUM     ;copy to A1L
+         sta   ]vertices
+         lda   LINNUM+1
+         sta   ]vertices+1
+         jsr   CHKCOM     ;eat the comma
+         jsr   FRMNUM     ;get index buffer address
+         jsr   GETADR     ;leave it in LINNUM
+         jsr   CHKCOM
+         jsr   GETBYT     ;get the count
+         cpx   #128       ;range check (0-127)
+         blt   :countok
+         jmp   IllQError
+:countok txa
+         asl              ;double it
+         sta   ]count     ;stash it
+         lda   #$00
+         sta   ]cur
+
+* Check for optional AT cx,cy.
+         jsr   CHRGOT
+         cmp   #tok_at
+         bne   :noat
+         jsr   CHRGET     ;eat the AT
+         lda   LINNUM     ;the code that reads the
+         pha              ; hi-res coordinates will
+         lda   LINNUM+1   ; overwrite LINNUM, so
+         pha              ; we have to save & restore
+         jsr   h_at
+         pla
+         sta   LINNUM+1
+         pla
+         sta   LINNUM
+:noat    lda   ]count     ;the loop below always runs
+         beq   :done      ; once, so bail if no lines
+
+* Draw loop.  getvertex advances ]cur by one.  If the
+* first vertex is bad we must also step over the
+* second index (:skip2).  After a successful draw the
+* $2c byte turns the "inc ]cur" into the operand of a
+* BIT instruction, so the increment is not executed.
+]loop    jsr   getvertex
+         bcs   :skip2
+         jsr   store0
+         jsr   getvertex
+         bcs   :skip
+         jsr   store1
+         jsr   f_DrawLine
+         dfb   $2c        ;BIT addr
+:skip2   inc   ]cur
+:skip    lda   ]cur
+         cmp   ]count
+         blt   ]loop
+:done    rts
+
+* Get the Nth vertex, specified by ]cur, and load it
+* into X/Y/A (xlo/xhi/y).  Returns with carry set if
+* the vertex is invalid.
+*
+* A vertex is invalid if its index has the high bit
+* set, or if the computed position falls outside
+* x=0-279, y=0-191.
+*
+* Increments ]cur by 1.
+getvertex
+         ldy   ]cur
+         inc   ]cur
+         lda   (]indices),y
+         bmi   :badv      ;must be 0-127
+         jsr   :calcvertex
+
+         ldx   g_out_x
+         ldy   g_out_x+1
+         beq   :xok       ;0-255, ok
+         cpy   #1
+         bne   :badv      ;hi not 0/1: negative or 512+
+         cpx   #280-256
+         bge   :badv      ;280-511
+:xok
+         lda   g_out_y+1
+         bne   :badv      ;Y is neg or > 255
+         lda   g_out_y
+         cmp   #192       ;carry clear if y < 192
+         bcc   :goodv
+:badv
+         sec
+:goodv   rts
+
+* Get VX and VY, merging with AC, and store in
+* 16-bit g_out_x and g_out_y.  Range not checked
+* here.  On entry, A has vertex index.
+*
+* Each vertex is two bytes (x then y), each treated as
+* a signed offset from the array center.
+:calcvertex
+         asl              ;two bytes per vertex
+         tay
+         ldx   #$00       ;hi byte of vertex
+         lda   (]vertices),y ;x-coord
+         bpl   :xpos
+         dex              ;sign-extend hi byte
+:xpos    clc
+         adc   g_ac_xl
+         sta   g_out_x
+         txa
+         adc   g_ac_xh
+         sta   g_out_x+1
+
+         iny
+         ldx   #$00
+         lda   (]vertices),y ;y-coord
+         bpl   :ypos
+         dex              ;sign-extend hi byte
+:ypos    clc
+         adc   g_ac_y     ;center Y has no hi byte,
+         sta   g_out_y    ; so just fold in the carry
+         bcc   :nocarry
+         inx
+:nocarry stx   g_out_y+1
+         rts
+
+
+
+********************************
+* Global variables
+*
+* These hold state for the ampersand layer only.
+
+g_cur_page ds  1          ;$20 or $40
+g_hcolor ds    1          ;last &HCOLOR= value (0-7)
+g_prevxl ds    1          ;Last point used by &HPLOT,
+g_prevxh ds    1          ; for a later
+g_prevy  ds    1          ; "&HPLOT TO".
+g_ac_xl  ds    1          ;Center-point coordinates
+g_ac_xh  ds    1          ; for array-based line
+g_ac_y   ds    1          ; draw (&AT, &PLOT).
+g_out_x  ds    2          ;16-bit coordinates for
+g_out_y  ds    2          ; array-based line draw
+
+
+
+         lst   on
+end      equ   *
+         sav   amperfdraw
+         lst   off
--- a/FDRAW.CIRCLE.S
+++ b/FDRAW.CIRCLE.S
@ -0,0 +1,752 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Circle rendering             *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* TODO: if USE_FAST is 0, replace the outline circle
+*  plot code with calls to DrawPoint (or maybe a
+*  common sub-function so we don't trash the input
+*  parameters).  Saves a little space.
+
+
+********************************
+*
+* Draw a circle.  The radius is in in_rad, and
+* the center is at in_x0l+in_x0h,in_y0.
+*
+********************************
+DrawCircle
+* The opcode byte at _cp08 doubles as the mode flag.
+* If it isn't already JSR, fixcplot rewrites every
+* _cpNN site so the plot routines get called.
+         lda   #$20       ;JSR
+         cmp   _cp08      ;configured for outline?
+         beq   :okay
+         jsr   fixcplot
+:okay
+         jmp   calc_circle
+
+
+********************************
+*
+* Draw filled circle.
+*
+********************************
+FillCircle
+* The opcode byte at _cp08 doubles as the mode flag.
+* If it isn't already BIT, fixcplot rewrites every
+* _cpNN site so the plot calls become no-ops; the
+* raster tables are then filled by FillRaster.
+         lda   #$2c       ;BIT
+         cmp   _cp08      ;configured for fill?
+         beq   :okay
+         jsr   fixcplot
+:okay
+         jsr   calc_circle
+         jmp   FillRaster
+
+
+* Calculate a circle, using Bresenham's algorithm.  The
+* results are placed into the rasterization buffers.
+*
+* in_rad must be from 0 to 255.  The x/y center
+* coordinates must be on the screen, but the circle
+* can extend off the edge.
+*
+* The computed values are stored in the rasterization
+* tables.  For an outline circle, we also plot the
+* points immediately.
+
+         do    USE_FAST   ;*****
+* local storage -- not used often enough to merit DP
+circ_8bit ds   1          ;1 if rad < max_fast_rad
+circ_clip ds   1          ;# of edges needing clip
+         fin              ;*****
+
+calc_circle
+max_fast_rad equ 41
+]cxl     equ   zloc0
+]cxh     equ   zloc1
+]cy      equ   zloc2
+]dlo     equ   zloc3
+]dhi     equ   zloc4
+]xsav    equ   zloc5
+]ysav    equ   zloc6
+]min_x   equ   zloc7      ;min/max offsets from center
+]max_x   equ   zloc8      ;(min is above center, max
+]min_y   equ   zloc9      ; is below)
+]max_y   equ   zloc10
+]hitmp   equ   zloc11
+* only used by hplot for outline circles
+]hbasl   equ   zptr0
+]andmask equ   zloc11     ;overlaps with ]hitmp
+]savxreg equ   zloc12
+]savyreg equ   zloc13
+
+* Special-case radius=0.  It removes an annoying
+* edge case (first y-- becomes 0xff, but 6502 cmp
+* is unsigned).
+*
+* Only the raster tables are filled in here; no plot
+* routine is called on this path.
+         lda   in_rad
+         bne   :notzero
+         ldy   in_y0
+         sty   rast_top
+         sty   rast_bottom
+         lda   in_x0l
+         sta   rastx0l,y
+         sta   rastx1l,y
+         lda   in_x0h
+         sta   rastx0h,y
+         sta   rastx1h,y
+         rts
+
+* Use different version of function for small
+* circles, because we can do it all in 8 bits.
+:notzero
+         do    USE_FAST   ;*****
+         ldy   #$01
+         cmp   #max_fast_rad ;in_rad in Acc
+         blt   :use_fast
+         dey
+:use_fast sty  circ_8bit
+         fin              ;*****
+
+         lda   in_x0l     ;copy center to DP for speed
+         sta   ]cxl
+         lda   in_x0h
+         sta   ]cxh
+         lda   in_y0
+         sta   ]cy
+
+* Compute min/max values, based on offset from center.
+* These are compared against offset-from-center x/y.
+* We need tight bounds on Y because we use it to
+* compute the rast_render top/bottom.  Getting tight
+* bounds on X is not so important, but we still need
+* it for the no-clip optimization.
+*
+* X starts at 4 and is decremented once for each edge
+* the circle cannot cross.
+         ldx   #$04       ;count edges needing clip
+
+         lda   #NUM_ROWS-1 ;191
+         sec
+         sbc   ]cy        ;maxY = 191-cy
+         cmp   in_rad
+         blt   :ylimok
+         lda   in_rad     ;clamp to radius
+         dex
+:ylimok  sta   ]max_y     ;smaller of 191-cy and rad
+
+         lda   ]cy        ;minY = cy
+         cmp   in_rad
+         blt   :ylimok2
+         lda   in_rad     ;clamp to radius
+         dex
+:ylimok2 sta   ]min_y
+
+         lda   ]cxh
+         beq   :xlimlo
+* Examples (note #<NUM_COLS-1 is 279-256 = 23):
+* cx=265 (cxh=1 cxl=11), 23-11=14, chk rad
+         lda   #<NUM_COLS-1 ;maxX = 279-cx
+         sec
+         sbc   ]cxl
+         cmp   in_rad
+         blt   :xlimhok
+         lda   in_rad     ;clamp to radius
+         dex
+:xlimhok sta   ]max_x
+
+         lda   in_rad     ;min X always out of range
+         dex              ; so just clamp to radius
+         sta   ]min_x
+         jmp   :xlimdone
+
+* Examples:
+* For cx=0 to 24, we can never pass right edge (our
+*  maximum radius is 255).
+* cx=3, 23-3=20 + carry set --> bad, must use rad
+* cx=24, 23-24=255 + carry clear --> ok, chk rad
+* cx=255, 23-255=24 + carry clear --> ok, chk rad
+:xlimlo
+         lda   #<NUM_COLS-1 ;maxX = 279-cx
+         sec
+         sbc   ]cxl
+         bcs   :xuserad
+         cmp   in_rad
+         blt   :xlimok
+:xuserad lda   in_rad     ;clamp to radius
+         dex
+:xlimok  sta   ]max_x
+
+         lda   ]cxl       ;minX = cx (cxh is 0 here)
+         cmp   in_rad
+         blt   :xlimok2
+         lda   in_rad     ;clamp to radius
+         dex
+:xlimok2 sta   ]min_x
+
+:xlimdone
+
+         do    USE_FAST   ;*****
+         stx   circ_clip
+         fin              ;*****
+
+* set top/bottom rows for rasterizer
+         lda   ]cy
+         clc
+         adc   ]max_y
+         sta   rast_bottom
+         lda   ]cy
+         sec
+         sbc   ]min_y
+         sta   rast_top
+
+         DO    0          ;debug debug debug
+         LDA   ]min_x     ;save a copy where the
+         STA   $0380      ; monitor won't trash it
+         LDA   ]max_x
+         STA   $0381
+         LDA   ]min_y
+         STA   $0382
+         LDA   ]max_y
+         STA   $0383
+         FIN
+
+* Set initial conditions for Bresenham.
+         ldx   #0         ;:x = 0
+         stx   ]xsav
+         ldy   in_rad     ;:y = rad
+         sty   ]ysav
+         lda   #1         ;:d = 1 - rad
+         sec
+         sbc   ]ysav      ;in_rad
+         sta   ]dlo
+         bcs   :hizero    ;C==1 if in_rad<=1
+         ldx   #$ff       ;C was 0, make neg
+:hizero  stx   ]dhi
+
+*
+* Outer loop -- plot 8 points, then update values.
+*
+circ_loop
+
+         do    USE_FAST   ;*****
+         lda   circ_clip  ;any edge need clipping?
+         beq   ncypy      ;no, use the quick version
+         jmp   with_clip
+
+* Quick version, no clipping required
+*
+* For each of four rows the left X goes to rastx0 and
+* the right X goes to rastx1.  Each _cpNN site is
+* either a JSR to a plot routine (outline) or a BIT
+* (fill); see fixcplot.  The right-hand sites use
+* cplotrn, which reuses the row address set up by the
+* cplotl call just before it.
+
+* row cy+y: cx-x and cx+x
+ncypy
+         lda   ]ysav
+         clc
+         adc   ]cy
+         tay              ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp00    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp01    jsr   cplotrn
+
+* row cy-y: cx-x and cx+x
+ncymy
+         lda   ]cy
+         sec
+         sbc   ]ysav
+         tay              ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp02    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp03    jsr   cplotrn
+
+* row cy+x: cx-y and cx+y
+ncypx
+         lda   ]xsav      ;row = cy + x
+         clc
+         adc   ]cy
+         tay              ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp04    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp05    jsr   cplotrn
+
+* row cy-x: cx-y and cx+y
+ncymx
+         lda   ]cy
+         sec
+         sbc   ]xsav
+         tay              ;y-coord in Y-reg
+
+         lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp06    jsr   cplotl
+
+         lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp07    jsr   cplotrn
+
+* CLICK
+         jmp   circ_plot_done
+
+         fin              ;***** (USE_FAST)
+
+*
+* Same thing, but this time clipping edges.
+*
+with_clip
+
+* Each of the four rows is skipped entirely if its
+* Y offset exceeds min_y/max_y.  Otherwise the left
+* edge is clamped to 0 and the right edge to
+* NUM_COLS-1 when the X offset exceeds min_x/max_x,
+* and a clamped edge is not plotted.  The right-hand
+* sites call cplotr (not cplotrn) because the left
+* plot, and its row lookup, may have been skipped.
+
+* row cy+y: cx-x and cx+x
+ccypy
+         lda   ]ysav      ;off bottom?
+         cmp   ]max_y
+         beq   :cypy_ok
+         bge   cypy_skip  ;completely off screen
+:cypy_ok clc
+         adc   ]cy
+         tay              ;y-coord in Y-reg
+
+         ldx   ]xsav      ;handle cx-x
+         cpx   ]min_x
+         blt   :cxmx_ok
+         beq   :cxmx_ok
+         lda   #0         ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmx_done0 ;always
+         BREAK
+:cxmx_ok lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp08    jsr   cplotl     ;opcode is also mode flag
+cxmx_done0
+
+         cpx   ]max_x     ;handle cx+x
+         blt   :cxpx_ok
+         beq   :cxpx_ok
+         lda   #<NUM_COLS-1 ;clip at right edge
+         sta   rastx1l,y
+         lda   #>NUM_COLS-1
+         sta   rastx1h,y
+         bne   cxpx_done0 ;always
+         BREAK
+:cxpx_ok lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp09    jsr   cplotr
+cxpx_done0
+cypy_skip
+
+* row cy-y: cx-x and cx+x
+ccymy
+         lda   ]ysav      ;off top?
+         cmp   ]min_y
+         beq   :cymy_ok
+         bge   cymy_skip
+:cymy_ok lda   ]cy
+         sec
+         sbc   ]ysav
+         tay              ;y-coord in Y-reg
+
+         ldx   ]xsav      ;handle cx-x
+         cpx   ]min_x
+         blt   :cxmx_ok
+         beq   :cxmx_ok
+         lda   #0         ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmx_done1 ;always
+         BREAK
+:cxmx_ok lda   ]cxl
+         sec
+         sbc   ]xsav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp10    jsr   cplotl
+cxmx_done1
+
+         cpx   ]max_x     ;handle cx+x
+         blt   :cxpx_ok
+         beq   :cxpx_ok
+         lda   #<NUM_COLS-1 ;clip at right edge
+         sta   rastx1l,y
+         lda   #>NUM_COLS-1
+         sta   rastx1h,y
+         bne   cxpx_done1 ;always
+         BREAK
+:cxpx_ok lda   ]cxl
+         clc
+         adc   ]xsav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp11    jsr   cplotr
+cxpx_done1
+cymy_skip
+
+* row cy+x: cx-y and cx+y
+ccypx
+         lda   ]xsav      ;off bottom?
+         cmp   ]max_y
+         beq   :cypx_ok
+         bge   cypx_skip
+:cypx_ok clc
+         adc   ]cy
+         tay              ;y-coord in Y-reg
+
+         ldx   ]ysav      ;handle cx-y
+         cpx   ]min_x
+         blt   :cxmy_ok
+         beq   :cxmy_ok
+         lda   #0         ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmy_done2 ;always
+         BREAK
+:cxmy_ok lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp12    jsr   cplotl
+cxmy_done2
+
+         cpx   ]max_x     ;handle cx+y
+         blt   :cxpy_ok
+         beq   :cxpy_ok
+         lda   #<NUM_COLS-1 ;clip at right edge
+         sta   rastx1l,y
+         lda   #>NUM_COLS-1
+         sta   rastx1h,y
+         bne   cxpy_done2 ;always
+         BREAK
+:cxpy_ok lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp13    jsr   cplotr
+cxpy_done2
+cypx_skip
+
+* row cy-x: cx-y and cx+y
+ccymx
+         lda   ]xsav      ;off top?
+         cmp   ]min_y
+         beq   :cymx_ok
+         bge   cymx_skip
+:cymx_ok lda   ]cy
+         sec
+         sbc   ]xsav
+         tay              ;y-coord in Y-reg
+
+         ldx   ]ysav      ;handle cx-y
+         cpx   ]min_x
+         blt   :cxmy_ok
+         beq   :cxmy_ok
+         lda   #0         ;clip at 0
+         sta   rastx0l,y
+         sta   rastx0h,y
+         beq   cxmy_done3 ;always
+         BREAK
+:cxmy_ok lda   ]cxl
+         sec
+         sbc   ]ysav
+         sta   rastx0l,y
+         lda   ]cxh
+         sbc   #$00
+         sta   rastx0h,y
+_cp14    jsr   cplotl
+cxmy_done3
+
+         cpx   ]max_x     ;handle cx+y
+         blt   :cxpy_ok
+         beq   :cxpy_ok
+         lda   #<NUM_COLS-1 ;clip at right edge
+         sta   rastx1l,y
+         lda   #>NUM_COLS-1
+         sta   rastx1h,y
+         bne   cxpy_done3 ;always
+         BREAK
+:cxpy_ok lda   ]cxl
+         clc
+         adc   ]ysav
+         sta   rastx1l,y
+         lda   ]cxh
+         adc   #$00
+         sta   rastx1h,y
+_cp15    jsr   cplotr
+cxpy_done3
+cymx_skip
+circ_plot_done
+* Update X/Y/D.  Up to about radius=41 we can maintain
+* 'd' in an 8-bit register.
+         do    USE_FAST   ;*****
+         lda   circ_8bit  ;set up in calc_circle
+         beq   circ_slow  ;0 = need the 16-bit update
+
+*
+* Bresenham update, with 8-bit 'd'.
+*
+* If d is negative only x advances; otherwise y is
+* decremented as well.  The loop continues while
+* x <= y.
+*
+         ldx   ]xsav
+         lda   ]dlo
+         bmi   :dneg
+         txa              ;:d = d + ((x-y)*4) +5
+         sec
+         sbc   ]ysav      ;x <= y, may be neg or 0
+         asl
+         asl
+         clc              ;can't know carry
+         adc   #5
+         clc              ;still don't want carry
+         adc   ]dlo
+         sta   ]dlo
+         dec   ]ysav      ;:y--
+         jmp   :loopbot
+:dneg    txa              ;:d = d + (x*4) +3
+         asl
+         asl              ;x always pos, C=0
+         DO    0
+         BCC   :TEST      ;debug
+         BREAK            ;debug
+:TEST                     ;debug
+         FIN
+         adc   #3
+         adc   ]dlo
+         sta   ]dlo
+:loopbot
+         inx              ;:x++
+         stx   ]xsav
+         cpx   ]ysav
+         beq   :again     ;x == y, one more pass
+         bge   circ_done  ;x > y, finished
+:again   jmp   circ_loop
+
+         fin              ;*****
+
+*
+* Bresenham update, with 16-bit 'd'
+*
+circ_slow
+* Same update as the 8-bit version, but 'd' is kept
+* in ]dhi:]dlo.  The term being added is widened to
+* 16 bits by shifting its high bits into ]hitmp.
+         CLICK
+         ldx   ]xsav
+         lda   ]dhi
+         bmi   :dneg
+         lda   ]dlo       ;add the constant 5 first
+         clc
+         adc   #5
+         sta   ]dlo
+         bcc   :noinc
+         inc   ]dhi
+:noinc
+         txa              ;:d = d + ((x-y)*4) +5
+         ldy   #$00
+         sty   ]hitmp
+         sec
+         sbc   ]ysav      ;x <= y, may be neg or 0
+         beq   :xeqy      ;if x==y, nothing to add
+         ldy   #$ff       ;nonzero means negative, so
+         sty   ]hitmp     ; sign-extend into hi byte
+         asl
+         rol   ]hitmp
+         asl
+         rol   ]hitmp
+         clc
+         adc   ]dlo
+         sta   ]dlo
+         lda   ]dhi
+         adc   ]hitmp
+         sta   ]dhi
+:xeqy
+         dec   ]ysav      ;:y--
+         jmp   :loopbot
+
+:dneg    lda   ]dlo       ;:d = d + (x*4) + 3
+         clc
+         adc   #3
+         sta   ]dlo
+         bcc   :noinc2
+         inc   ]dhi
+:noinc2  txa
+         ldy   #0         ;x always positive
+         sty   ]hitmp
+         asl
+         rol   ]hitmp
+         asl
+         rol   ]hitmp
+         clc              ;not needed?
+         adc   ]dlo
+         sta   ]dlo
+         lda   ]dhi
+         adc   ]hitmp
+         sta   ]dhi
+:loopbot
+         inx              ;:x++
+         stx   ]xsav
+         cpx   ]ysav
+         beq   :again     ;x == y, one more pass
+         bge   circ_done  ;x > y, finished
+:again   jmp   circ_loop
+
+
+circ_done rts
+
+
+* Plot a point for outline circle rendering.
+*
+* X and Y must be preserved.  Y holds the current line
+* number.
+*
+* Most DP locations are in use -- see the variable
+* declarations at the start of the circle function.
+
+* cplotl is the entry point for the leftmost point.
+cplotl
+* Reads the X coordinate for row Y from rastx0 and
+* sets up ]hbasl for that row.  X and Y are saved here
+* and restored at the end of cplotcom.
+         stx   ]savxreg
+         sty   ]savyreg
+
+         lda   ylooklo,y  ;row base address, lo
+         sta   ]hbasl
+         lda   ylookhi,y
+_pg_or2  ora   #$20       ;NOTE(review): label implies
+         sta   ]hbasl+1   ; operand is patched; confirm
+
+* Convert the X coordinate into byte/bit.
+         ldx   rastx0l,y  ;x coord, lo
+         lda   rastx0h,y  ;>= 256?
+         beq   :lotabl    ;no, use the low table
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   cplotcom   ;always
+         BREAK            ;debug
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+         jmp   cplotcom
+
+* cplotr is the entry point for the rightmost point.
+* We use rastx1 instead of rastx0.
+cplotr
+* Sets up ]hbasl for row Y, then falls into cplotrn.
+         lda   ylooklo,y  ;row base address, lo
+         sta   ]hbasl
+         lda   ylookhi,y
+_pg_or3  ora   #$20       ;NOTE(review): label implies
+         sta   ]hbasl+1   ; operand is patched; confirm
+
+* If we just plotted the left point on the same line,
+* we can skip the Y-lookup by jumping here.
+cplotrn
+         stx   ]savxreg
+         sty   ]savyreg
+
+         ldx   rastx1l,y  ;x coord, lo
+         lda   rastx1h,y  ;>= 256?
+         beq   :lotabl    ;no, use the low table
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   cplotcom   ;always
+         BREAK            ;debug
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+
+* Plot the point.  The byte offset (0-39) is in Y,
+* the bit offset (0-6) is in A.
+*
+* Shared tail for cplotl/cplotr/cplotrn; restores the
+* X and Y registers they saved before returning.
+cplotcom
+         tax
+         lda   colorline,y ;start with color pattern
+         eor   (]hbasl),y ;flip all bits
+         and   andmask,x  ;clear other bits
+         eor   (]hbasl),y ;restore ours, set theirs
+         sta   (]hbasl),y
+
+         ldx   ]savxreg
+         ldy   ]savyreg
+         rts
+
+* Reconfigure calc_circle to either JSR to cplotl/r,
+* or just BIT the address (a 4-cycle no-op).  The
+* desired instruction is in A.
+fixcplot
+* Write the opcode in A over the first byte of every
+* plot call site.  The clipping sites always exist;
+* the no-clip sites exist only when USE_FAST is set.
+         sta   _cp08      ;this one is also the flag
+         sta   _cp09      ; tested by DrawCircle and
+         sta   _cp10      ; FillCircle
+         sta   _cp11
+         sta   _cp12
+         sta   _cp13
+         sta   _cp14
+         sta   _cp15
+         do    USE_FAST   ;*****
+         sta   _cp00
+         sta   _cp01
+         sta   _cp02
+         sta   _cp03
+         sta   _cp04
+         sta   _cp05
+         sta   _cp06
+         sta   _cp07
+         fin              ;*****
+         rts
--- a/FDRAW.LINE.S
+++ b/FDRAW.LINE.S
@ -0,0 +1,588 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Point and line functions     *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+
+********************************
+*
+* Plot one pixel at (in_x0l/h, in_y0) in the
+* current color on the current hi-res page.
+*
+********************************
+DrawPoint
+]hbasl   equ   zptr0
+
+* Build the address of the screen line.
+         ldy   in_y0
+         lda   ylookhi,y
+         ora   g_page     ;select hi-res page
+         sta   ]hbasl+1
+         lda   ylooklo,y
+         sta   ]hbasl
+
+* Convert the X coordinate to byte and bit offsets.
+         ldx   in_x0l     ;lo byte of X coord
+         lda   in_x0h     ;hi byte; nonzero if >= 256
+         beq   :lowtab    ;zero, use the low tables
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   :setpix    ;always
+         BREAK            ;debug
+:lowtab  ldy   div7lo,x
+         lda   mod7lo,x
+
+* Y now holds the byte offset (0-39) and A the bit
+* offset (0-6).  Merge the pixel into the screen.
+:setpix
+         tax              ;bit offset indexes andmask
+         lda   colorline,y ;color pattern for byte
+         eor   (]hbasl),y ;difference from screen
+         and   andmask,x  ;keep only our pixel
+         eor   (]hbasl),y ;merge back with screen
+         sta   (]hbasl),y
+         rts
+
+
+********************************
+*
+* Draw a line between two points.
+*
+* Inputs are (in_x0l/h, in_y0) and (in_x1l/h, in_y1).
+* The endpoints may be given in either order.  This
+* section normalizes the line to run left-to-right,
+* computes the deltas, patches the Y-step opcodes,
+* and dispatches to the horizontal-dominant,
+* vertical-dominant, pure-vertical, or (via
+* FillRaster) pure-horizontal handler.
+*
+********************************
+DrawLine
+
+]hbasl   equ   zptr0
+]xposl   equ   zloc0      ;always left edge
+]xposh   equ   zloc1
+]ypos    equ   zloc2      ;top or bottom
+]deltaxl equ   zloc3
+]deltaxh equ   zloc4
+]deltay  equ   zloc5
+]count   equ   zloc6
+]counth  equ   zloc7
+]diff    equ   zloc8
+]diffh   equ   zloc9
+]andmask equ   zloc10
+]wideflag equ  zloc11     ;doesn't really need DP
+
+* We use a traditional Bresenham run-length approach.
+* Run-slicing is possible, but the code is larger
+* and the increased cost means it's only valuable
+* for longer lines.  An optimal solution would switch
+* approaches based on line length.
+*
+* Start by identifying whether x0 or x1 is on the
+* left.  To make life simpler we always work from
+* left to right, flipping the coordinates if
+* needed.
+*
+* We also need to figure out if the line is more
+* than 255 pixels long -- which, because of
+* inclusive coordinates, means abs(x0-x1) > 254.
+         lda   in_x1l     ;assume x0 on left
+         sec
+         sbc   in_x0l
+         tax              ;keep lo byte of (x1-x0)
+         beq   checkvert  ;low bytes equal, check hi
+         lda   in_x1h
+         sbc   in_x0h
+         bcs   lx0left    ;no borrow: x1 >= x0
+
+* x1 is on the left, so the values are negative
+* (hi byte in A, lo byte in X)
+lx0right eor   #$ff       ;invert hi
+         sta   ]deltaxh   ;store
+         txa
+         eor   #$ff       ;invert lo
+         sta   ]deltaxl
+         inc   ]deltaxl   ;add one for 2s complement
+         bne   :noinchi   ;rolled into high byte?
+         inc   ]deltaxh   ;yes
+:noinchi lda   in_x1l     ;start with x1
+         sta   ]xposl
+         lda   in_x1h
+         sta   ]xposh
+         lda   in_y1
+         sta   ]ypos
+         sec
+         sbc   in_y0      ;compute deltay
+         jmp   lncommon
+
+* Low bytes of x0 and x1 match.  Either the high
+* bytes differ or the line is purely vertical.
+checkvert
+         lda   in_x1h     ;diff high bytes
+         sbc   in_x0h     ;(carry still set)
+         blt   lx0right   ;width=256, x0 right
+         bne   lx0left    ;width=256, x0 left
+         jmp   vertline   ;all zero, go vert
+
+* (branch back from below)
+* This is a purely horizontal line.  We farm the job
+* out to the raster fill code for speed.  (There's
+* no problem with the line code handling it; it's just
+* more efficient to let the raster code do it.)
+phorizontal
+         ldy   ]ypos
+         sty   rast_top   ;one-line raster: top and
+         sty   rast_bottom ; bottom are the same row
+         lda   ]xposl
+         sta   rastx0l,y
+         clc
+         adc   ]deltaxl   ;easier to add delta back
+         sta   rastx1l,y  ; in than sort out which
+         lda   ]xposh     ; arg is left vs. right
+         sta   rastx0h,y
+         adc   ]deltaxh
+         sta   rastx1h,y
+         jmp   FillRaster
+
+* x0 is on the left, so the values are positive
+* (hi byte in A, lo byte in X)
+lx0left  stx   ]deltaxl
+         sta   ]deltaxh
+         lda   in_x0l     ;start with x0
+         sta   ]xposl
+         lda   in_x0h
+         sta   ]xposh
+         lda   in_y0      ;and y0
+         sta   ]ypos
+         sec
+         sbc   in_y1      ;compute deltay
+
+* Value of (starty - endy) is in A, flags still set.
+* Store abs(deltay) and load the opcode that steps
+* the line index in the right direction.
+lncommon
+         bcs   :posy      ;no borrow: starty >= endy
+         eor   #$ff       ;negative, invert
+         adc   #$01       ;(carry is clear here)
+         sta   ]deltay
+         lda   #$e8       ;INX
+         bne   gotdy      ;always
+:posy
+_lmb     beq   phorizontal ;(SetLineMode patches opcode)
+         sta   ]deltay
+         lda   #$ca       ;DEX
+gotdy    sta   _hmody     ;patch the Y-step opcode in
+         sta   _vmody     ; all three line loops
+         sta   _wmody
+
+         do    0          ;***** for regression test
+         ldx   #$01
+         lda   ]deltaxh
+         bne   :iswide
+         lda   ]deltaxl
+         cmp   #$ff       ;== 255?
+         beq   :iswide
+         ldx   #$00       ;notwide
+:iswide  stx   $300
+         lda   ]xposl
+         sta   $301
+         lda   ]xposh
+         sta   $302
+         lda   ]ypos
+         sta   $303
+         ldx   ]deltaxl
+         stx   $304
+         ldx   ]deltaxh
+         stx   $305
+         ldx   ]deltay
+         stx   $306
+         lda   _hmody
+         and   #$20       ;nonzero means inc,
+         sta   $307       ; zero means dec
+         fin              ;*****
+
+* At this point we have the initial X position in
+* ]xposl/h, the initial Y position in ]ypos,
+* deltax in ]deltaxl/h, deltay in ]deltay, and we've
+* tweaked the Y-update instructions to either INX or
+* DEX depending on the direction of movement.
+*
+* The next step is to decide whether the line is
+* horizontal-dominant or vertical-dominant, and
+* branch to the appropriate handler.
+*
+* The core loops for horiz and vert take about
+* 80 cycles when moving diagonally, and about
+* 20 fewer when moving in the primary direction.
+* The wide-horiz is a bit slower.
+         ldy   #$01       ;set "wide" flag to 1
+         lda   ]deltaxl
+         ldx   ]deltaxh
+         bne   horzdom    ;width >= 256
+         cmp   #$ff       ;width == 255
+         beq   horzdom
+         dey              ;not wide
+         cmp   ]deltay
+         bge   horzdom    ;deltax >= deltay (incl. diagonal)
+         jmp   vertdom
+
+* We could special-case pure-diagonal lines here
+* (just BEQ a couple lines up).  It does
+* represent our worst case.  I'm not convinced
+* we'll see them often enough to make it worthwhile.
+
+* horizontal-dominant
+*
+* Entered with the "wide" flag (0 or 1) in Y and
+* ]deltaxl in A.  Sets up the byte offset, pixel
+* mask, and line address, then either hands off to
+* widedom or runs horzloop for deltax+1 pixels.
+horzdom
+         sty   ]wideflag
+         sta   ]count     ;:count = deltax + 1
+         inc   ]count
+         lsr              ;:diff = deltax / 2
+         sta   ]diff
+
+* set Y to the byte offset in the line
+* load the AND mask into ]andmask
+         ldx   ]xposl
+         lda   ]xposh     ;>= 256?
+         beq   :lotabl    ;no, use the low table
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   :gottab    ;always
+* BREAK ;debug
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+:gottab
+         tax
+         lda   andmask,x
+         sta   ]andmask
+
+* Set initial value for line address.
+         ldx   ]ypos
+         lda   ylooklo,x
+         sta   ]hbasl
+         lda   ylookhi,x
+         ora   g_page
+         sta   ]hbasl+1
+
+         lda   ]wideflag  ;is this a "wide" line?
+         beq   :notwide   ;nope, stay local
+         jmp   widedom    ;(Y=byte offset, X=line)
+
+:notwide lda   colorline,y ;set initial color mask
+         sta   _hlcolor+1 ;patch LDA operand in loop
+         jmp   horzloop
+
+hrts     rts
+
+* bottom of loop, essentially
+hnoroll  sta   ]diff      ;3
+hdecc    dec   ]count     ;5 :count--
+         beq   hrts       ;2 :while (count != 0)
+                          ;= 7 or 10
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+horzloop
+_hlcolor lda   #$00       ;2 start with color pattern
+_lmdh    eor   (]hbasl),y ;5 flip all bits
+         and   ]andmask   ;3 clear other bits
+         eor   (]hbasl),y ;5 restore ours, set theirs
+         sta   (]hbasl),y ;6 = 21
+* (The opcode at _lmdh is patched by SetLineMode.)
+
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+*
+* If this is a shallow line we would benefit from
+* keeping the index in X and just doing a 4-cycle
+* indexed load to get the mask. Not having the
+* line number in X makes the line calc more
+* expensive for steeper lines though.
+         lda   ]andmask   ;3
+         asl              ;2 shift, losing hi bit
+         eor   #$80       ;2 set the hi bit
+         bne   :noh8      ;3 cleared hi bit?
+* We could BEQ away and branch back in, but this
+* happens every 7 iterations, so on average it's
+* a very small improvement.  If we happen to branch
+* across a page boundary the double-branch adds
+* two more cycles and we lose.
+         iny              ;2 advance to next byte
+         lda   colorline,y ;4 update color mask
+         sta   _hlcolor+1 ;4
+         lda   #$81       ;2 reset
+:noh8    sta   ]andmask   ;3 = 13 + ((12-1)/7) = 14
+
+* Update error diff.
+         lda   ]diff      ;3
+         sec              ;2
+         sbc   ]deltay    ;3 :diff -= deltay
+         bcs   hnoroll    ;2+ :if (diff < 0) ...
+                          ;= 11 level, 10 up/down
+         adc   ]deltaxl   ;3 :  diff += deltax
+         sta   ]diff      ;3
+_hmody   inx              ;2 :  ypos++ (or --)
+         lda   ylooklo,x  ;4 update hbasl after line
+         sta   ]hbasl     ;3  change
+         lda   ylookhi,x  ;4
+_pg_or4  ora   #$20       ;2 (operand set by SetPage)
+         sta   ]hbasl+1   ;3
+         bne   hdecc      ;3 = +27 this path -> 37
+         BREAK
+* horizontal: 10+21+14+11=56 cycles/pixel
+* diagonal:   7+21+14+37=79 cycles/pixel
+
+
+* Vertical-dominant line.  Could go up or down.
+* One pixel is plotted per row, starting at ]ypos
+* and ending on the other endpoint's row.  X moves
+* right one pixel whenever the error term rolls.
+vertdom
+         ldx   in_y0
+         cpx   ]ypos      ;did we start at y0?
+         bne   :endy0     ;no (started at y1), end at y0
+         ldx   in_y1      ;yes, so end at y1
+:endy0   stx   _vchk+1    ;end condition
+
+         lda   ]deltay
+         lsr
+         sta   ]diff      ;:diff = deltay / 2
+
+* set Y to the byte offset in the line
+* load the AND mask into ]andmask
+         ldx   ]xposl
+         lda   ]xposh     ;>= 256?
+         beq   :lotabl    ;no, use the low table
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   :gottab    ;always
+         BREAK            ;debug
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+:gottab
+         tax
+         lda   andmask,x  ;initial pixel mask
+         sta   ]andmask
+
+         lda   colorline,y ;initial color mask
+         sta   _vlcolor+1 ;patch LDA operand in loop
+
+         ldx   ]ypos
+         jmp   vertloop
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+
+* Bottom of loop, essentially.
+vnoroll  sta   ]diff      ;3
+
+vertloop
+         lda   ylooklo,x  ;4
+         sta   ]hbasl     ;3
+         lda   ylookhi,x  ;4
+_pg_or5  ora   #$20       ;2 (operand set by SetPage)
+         sta   ]hbasl+1   ;3 = 16
+
+_vlcolor lda   #$00       ;2 start with color pattern
+_lmdv    eor   (]hbasl),y ;5 flip all bits
+         and   ]andmask   ;3 clear other bits
+         eor   (]hbasl),y ;5 restore ours, set theirs
+         sta   (]hbasl),y ;6 = 21
+* (The opcode at _lmdv is patched by SetLineMode.)
+
+_vchk    cpx   #$00       ;2 was this last line?
+         beq   vrts       ;2 yes, done
+_vmody   inx              ;2 :ypos++ (or --)
+
+* Update error diff.
+         lda   ]diff      ;3
+         sec              ;2
+         sbc   ]deltaxl   ;3 :diff -= deltax
+         bcs   vnoroll    ;2 :if (diff < 0) ...
+                          ;= 10 vert, 9 move right
+
+         adc   ]deltay    ;3 :  diff += deltay
+         sta   ]diff      ;3
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+         lda   ]andmask   ;3
+         asl              ;2 shift, losing hi bit
+         eor   #$80       ;2 set the hi bit
+         beq   :is8       ;2+ goes to zero on 8th bit
+         sta   ]andmask   ;3
+         bne   vertloop   ;3 = 21 + (18/7) = 24
+         BREAK
+
+:is8     iny              ;2 advance to next byte
+         lda   colorline,y ;4 update color
+         sta   _vlcolor+1 ;4
+         lda   #$81       ;2 reset
+         sta   ]andmask   ;3
+         bne   vertloop   ;3 = 18
+         BREAK
+vrts     rts
+* vertical: 3 + 16 + 21 + 6 + 10 = 56 cycles
+* diagonal: 16 + 21 + 6 + 9 + 24 = 76 cycles
+
+
+* "Wide" horizontally-dominant loop.  We have to
+* maintain error-diff and deltax as 16-bit values.
+* Most of the setup from the "narrow" version carried
+* over, but we have to re-do the count and diff.
+*
+* Normally we set count to (deltax + 1) and decrement
+* to zero, but it's actually easier to set it equal
+* to deltax and check for -1.
+*
+* Entered from horzdom with the byte offset in Y and
+* with ]hbasl and ]andmask already set up.
+widedom
+         lda   ]deltaxh   ;:count = deltax
+         sta   ]counth
+         ldx   ]deltaxl
+         stx   ]count
+         stx   ]diff
+         lsr              ;:diff = deltax / 2
+         ror   ]diff
+         sta   ]diffh
+         ldx   ]ypos
+
+         lda   colorline,y ;set initial color mask
+         sta   _wlcolor+1 ;patch LDA operand in loop
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+wideloop
+_wlcolor lda   #$00       ;2 start with color pattern
+_lmdw    eor   (]hbasl),y ;5 flip all bits
+         and   ]andmask   ;3 clear other bits
+         eor   (]hbasl),y ;5 restore ours, set theirs
+         sta   (]hbasl),y ;6 = 21
+* (The opcode at _lmdw is patched by SetLineMode.)
+
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+*
+* The new color mask must go into this loop's own
+* LDA operand (_wlcolor).  Storing it in _hlcolor,
+* the narrow loop's operand, would leave the wide
+* line using the first byte's color pattern for its
+* whole length.
+         lda   ]andmask   ;3
+         asl              ;2 shift, losing hi bit
+         eor   #$80       ;2 set the hi bit
+         bne   :not7      ;3 goes to zero on 8th bit
+         iny              ; 2 advance to next byte
+         lda   colorline,y ; 4 update color mask
+         sta   _wlcolor+1 ; 4
+         lda   #$81       ; 2 reset
+:not7    sta   ]andmask   ;3 = 13 usually, 25 every 7
+
+* Update error diff, which is a positive number.  If
+* it goes negative ("if (diff < 0)") we act.
+         lda   ]diff
+         sec
+         sbc   ]deltay    ;:diff -= deltay
+         bcs   wnoroll    ;didn't even roll low byte
+         dec   ]diffh     ;check hi byte
+         bpl   wnoroll    ;went 1->0, keep going
+
+* Carry is still clear from the borrow above.
+         adc   ]deltaxl   ;:  diff += deltax
+         sta   ]diff
+         lda   ]diffh
+         adc   ]deltaxh
+         sta   ]diffh
+_wmody   inx              ;:  ypos++ (or --)
+         lda   ylooklo,x  ;update hbasl after line
+         sta   ]hbasl     ; change
+         lda   ylookhi,x
+_pg_or6  ora   #$20       ;(operand set by SetPage)
+         sta   ]hbasl+1
+         bne   wdecc      ;always
+         BREAK
+
+wnoroll  sta   ]diff
+
+wdecc    dec   ]count     ;5 :count--
+         lda   ]count     ;3
+         cmp   #$ff       ;2
+         bne   wideloop   ;3 :while (count > -1)
+         dec   ]counth    ;low rolled, decr high
+         beq   wideloop   ;went 1->0, keep going
+         rts
+
+
+* Pure-vertical line.  These are common in certain
+* applications, and checking for it only adds two
+* cycles to the general case.
+*
+* Reached when x0 == x1.  Draws from the smaller of
+* in_y0/in_y1 down through the larger, inclusive.
+vertline
+         ldx   in_y0
+         ldy   in_y1
+         cpx   in_y1      ;y0 < y1?
+         blt   :usey0     ;yes, go from y0 to y1
+         txa              ;no: Y = y0 (copied via A),
+         tay
+         ldx   in_y1      ; and X = y1
+:usey0   stx   ]ypos      ;top line
+         iny              ;one past the bottom line
+         sty   _pvytest+1 ;patch loop end test
+
+         ldx   in_x0l     ;xc lo
+         lda   in_x0h     ;>= 256?
+         beq   :lotabl
+         ldy   div7hi,x
+         lda   mod7hi,x
+         bpl   :gotit     ;always
+:lotabl  ldy   div7lo,x
+         lda   mod7lo,x
+
+* Byte offset is in Y, mod-7 value is in A.
+:gotit   tax
+         lda   andmask,x
+         sta   _pvand+1   ;this doesn't change
+
+         lda   colorline,y
+         sta   _pvcolor+1 ;nor does this
+
+         ldx   ]ypos      ;top line
+
+* There's a trick where, when (linenum & 0x07) is
+* nonzero, you just add 4 to hbasl+1 instead of
+* re-doing the lookup.  However, TXA+AND+BEQ
+* followed by LDA+CLC+ADC+STA is 16 cycles, the same
+* as our self-modified lookup, so it's not a win.
+* (And if we used a second ylookhi and self-modded
+* the table address, we could shave off another 2.)
+
+* Main pure-vertical loop.  X is the line index and
+* Y is the byte offset in the line throughout.
+pverloop
+         lda   ylooklo,x  ;4
+         sta   ]hbasl     ;3
+         lda   ylookhi,x  ;4
+_pg_or7  ora   #$20       ;2 (operand set by SetPage)
+         sta   ]hbasl+1   ;3 (= 16)
+
+_pvcolor lda   #$00       ;2 start with color pattern
+_lmdpv   eor   (]hbasl),y ;5 flip all bits
+_pvand   and   #$00       ;2 clear other bits
+         eor   (]hbasl),y ;5
+         sta   (]hbasl),y ;6 (= 20)
+* (The opcode at _lmdpv is patched by SetLineMode.)
+
+         inx              ;2
+_pvytest cpx   #$00       ;2 done?
+         bne   pverloop   ;3 = 7
+         rts
+* 43 cycles/pixel
+
+
+********************************
+*
+* Select the line drawing mode from in_arg:
+* zero for standard lines, nonzero for xdraw.
+*
+* The mode is applied by patching opcodes in the
+* line code: _lmb (the branch to phorizontal) and
+* the first EOR of each pixel-merge sequence.
+* A cheap way to get xdraw lines without writing
+* separate loops for them.
+*
+********************************
+SetLineMode
+         lda   in_arg
+         beq   :normal    ;zero, standard drawing
+
+* xdraw: every patched site becomes "BIT dp".
+         lda   #$24       ;BIT dp
+         sta   _lmb
+         bne   :patch     ;always (A is nonzero)
+
+* standard: restore the original opcodes.
+:normal  lda   #$f0       ;BEQ
+         sta   _lmb
+         lda   #$51       ;EOR (dp),y
+
+* A holds the opcode for the four pixel-merge sites.
+:patch   sta   _lmdpv
+         sta   _lmdw
+         sta   _lmdv
+         sta   _lmdh
+         rts
--- a/FDRAW.S
+++ b/FDRAW.S
@ -0,0 +1,805 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Main source file             *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* Set to 1 to build FDRAW.FAST, set to zero to
+* build FDRAW.SMALL.  Controls the PG_ALIGN macro
+* and the "do USE_FAST" conditional sections.
+USE_FAST equ   1
+
+* Set to 1 to turn on beeps/clicks for debugging.
+* Also enables the BRK emitted by the BREAK macro.
+NOISE_ON equ   0
+
+
+         lst   off
+         org   $6000
+
+*
+* Macros.
+*
+spkr     equ   $c030      ;accessed by CLICK
+bell     equ   $ff3a      ;called by BEEP and SetPage
+
+* If enabled, click the speaker (changes flags only).
+CLICK    mac
+         do    NOISE_ON
+         bit   spkr
+         fin
+         <<<
+* If enabled, beep the speaker (scrambles regs).
+BEEP     mac
+         do    NOISE_ON
+         jsr   bell
+         fin
+         <<<
+* If enabled, insert a BRK.  With NOISE_ON set to
+* zero this macro emits no code at all.
+BREAK    mac
+         do    NOISE_ON
+         brk   $99
+         fin
+         <<<
+
+* In "fast" mode, we align tables on page boundaries so we
+* don't take a 1-cycle hit when the indexing crosses a page.
+* In "small" mode, we skip the alignment.
+PG_ALIGN mac
+         do    USE_FAST
+         ds    \
+         fin
+         <<<
+
+*
+* Hi-res screen constants.
+*
+BYTES_PER_ROW = 40
+NUM_ROWS =     192
+NUM_COLS =     280
+
+*
+* Variable storage.  We assign generic names to
+* zero-page scratch locations, then assign variables
+* with real names to these.  Each routine maps its
+* own "]name" variables onto these with EQU, so the
+* same location serves different purposes in
+* different routines.
+*
+* 06-09 are unused (except by SWEET-16)
+* 1a-1d are Applesoft hi-res scratch
+* cc-cf are only used by INTBASIC
+* eb-ef and ff appear totally unused by ROM routines
+*
+zptr0    equ   $1a        ;2b
+zloc0    equ   $06
+zloc1    equ   $07
+zloc2    equ   $08
+zloc3    equ   $09
+zloc4    equ   $1c
+zloc5    equ   $1d
+zloc6    equ   $cc
+zloc7    equ   $cd
+zloc8    equ   $ce
+zloc9    equ   $cf
+zloc10   equ   $eb
+zloc11   equ   $ec
+zloc12   equ   $ed
+zloc13   equ   $ee
+
+
+********************************
+*
+* Entry points for external programs.
+*
+* Everything from Entry through noimpl has a fixed
+* layout.  The offsets noted below are relative to
+* Entry and follow from the sizes of the items
+* (3 bytes per JMP, 2 per DA).
+*
+********************************
+Entry
+         jmp   Init       ;initialize data tables
+         dfb   0,3        ;version number
+
+*
+* Parameters passed from external programs.
+* These occupy offsets +5 through +12.
+*
+in_arg   ds    1          ;generic argument
+in_x0l   ds    1          ;X coordinate 0, low part
+in_x0h   ds    1          ;X coordinate 0, high part
+in_y0    ds    1          ;Y coordinate 0
+in_x1l   ds    1          ;X coordinate 1, low part
+in_x1h   ds    1          ;X coordinate 1, high part
+in_y1    ds    1          ;Y coordinate 1
+in_rad   ds    1          ;radius for circles
+
+         ds    3          ;pad to 16 bytes
+
+         jmp   SetColor   ;+16
+         jmp   SetPage    ;+19
+         jmp   Clear      ;+22
+         jmp   DrawPoint  ;+25
+         jmp   DrawLine   ;+28
+         jmp   DrawRect   ;+31
+         jmp   FillRect   ;+34
+         jmp   DrawCircle ;+37
+         jmp   FillCircle ;+40
+         jmp   SetLineMode ;+43
+         jmp   noimpl     ;reserved2
+         jmp   FillRaster ;+49
+
+* Raster fill values.  Top, bottom, and pointers to tables
+* for the benefit of external callers.
+rast_top ds    1
+rast_bottom ds 1
+         da    rastx0l
+         da    rastx0h
+         da    rastx1l
+         da    rastx1h
+
+* Target for reserved jump table slots.
+noimpl   rts
+
+
+********************************
+*
+* Global variables.
+*
+********************************
+
+g_inited dfb   0          ;initialized?  (set by Init)
+g_color  dfb   0          ;hi-res color (0-7)
+g_page   dfb   $20        ;hi-res page ($20 or $40)
+
+
+********************************
+*
+* One-time setup.  Selects color 0, standard line
+* mode, and hi-res page 1, and marks the library as
+* initialized so that SetPage will run.
+*
+* NOTE(review): SetColor returns early when in_arg
+* equals g_color, so the colorline table is only
+* rebuilt here if g_color was nonzero on entry.
+*
+********************************
+Init
+         lda   #$00       ;argument for both calls
+         sta   in_arg
+         jsr   SetLineMode ;standard (non-xdraw) lines
+         jsr   SetColor   ;color zero
+         lda   #$20       ;page 1; also nonzero flag
+         sta   g_inited
+         sta   in_arg
+         jmp   SetPage    ;tail call
+
+
+********************************
+*
+* Select the drawing color.  The low three bits of
+* in_arg become g_color.  Returns without doing
+* anything if in_arg already equals g_color.
+*
+********************************
+SetColor
+         lda   in_arg
+         cmp   g_color    ;already the current color?
+         beq   :same      ;yes, nothing to do
+
+         and   #$07       ;force into range 0-7
+         tax
+         sta   g_color
+
+* Rebuild the "colorline" table, which gives the color
+* pattern for each of the 40 bytes in a row so that
+* odd/even bytes can be looked up directly.  Keeping one
+* table per color and self-modifying every "LDA addr,y"
+* to point at the right one would avoid this work, at
+* the cost of a lot of memory and some ugliness.
+* Takes 16 + (12 * 40) = 496 cycles.
+         ldy   #BYTES_PER_ROW-1 ;2 start at last byte
+         lda   xormask,x  ;4
+         sta   :_flip+1   ;4 patch EOR operand below
+
+         lda   oddcolor,x ;4 last byte is odd
+]fill    sta   colorline,y ;5
+:_flip   eor   #$00       ;2 switch odd <-> even
+         dey              ;2
+         bpl   ]fill      ;3
+
+:same    rts
+
+
+********************************
+*
+* Select the hi-res page to draw on.  in_arg must
+* be $20 (page 1) or $40 (page 2); anything else
+* jumps to the bell routine and changes nothing.
+* If Init has not been called, a message is printed
+* instead (see noinit).
+*
+********************************
+SetPage
+         lda   g_inited   ;refuse to run before Init
+         beq   noinit     ; (cheap; rarely called)
+
+         lda   in_arg
+         cmp   #$40       ;page 2?
+         beq   :valid
+         cmp   #$20       ;page 1?
+         beq   :valid
+         jmp   bell       ;neither, complain
+:valid
+         sta   g_page
+
+         do    0          ;*****
+         cmp   ylookhi
+         beq   :tabok
+* Check to see if the values currently in the Y-lookup table
+* match our current page setting.  If they don't, we need to
+* adjust the code that does lookups.
+
+* This approach modifies the table itself, paying a large
+* cost now so we don't have to pay it on every lookup.
+* However, this costs 2+(16*192)=3074 cycles, while an
+* "ORA imm" only adds two to each lookup, so we'd have
+* to do a lot of drawing to make this worthwhile.
+* (Note: assumes ylookhi is based at $2000 not $0000)
+         ldy   #NUM_ROWS  ;2
+]loop    lda   ylookhi-1,y ;4
+         eor   #$60       ;2 $20 <--> $40
+         sta   ylookhi-1,y ;5
+         dey              ;2
+         bne   ]loop      ;3
+
+         else             ;*****
+
+* The active approach: write the page value into the
+* operand of each "ORA imm" that follows a ylookhi
+* lookup.  Doing it here is a little messy, but it
+* means the drawing calls don't have to.
+*
+* A second y-lookup table, selected by patching the
+* table pointers, would let us drop the "ORA imm"
+* entirely without the table-rewrite cost above, but
+* it would eat another 192 bytes.
+         sta   _pg_or7+1  ;drawline
+         sta   _pg_or6+1  ;drawline
+         sta   _pg_or5+1  ;drawline
+         sta   _pg_or4+1  ;drawline
+         sta   _pg_or3+1  ;circle hplot
+         sta   _pg_or2+1  ;circle hplot
+         sta   _pg_or1+1  ;rastfill
+
+         fin              ;*****
+
+:tabok   rts
+
+* Reached from SetPage when g_inited is zero.  Sends
+* the zero-terminated message below, one byte at a
+* time, to the routine at $fded, then returns.
+noinit   ldy   #$00
+]next    lda   :text,y
+         beq   :end       ;hit the terminator
+         jsr   $fded      ;cout
+         iny
+         bne   ]next
+:end     rts
+
+:text    asc   "FDRAW NOT INITIALIZED",87,87,00
+
+
+********************************
+*
+* Clear the screen to the current color.
+*
+* Fills the hi-res page selected by g_page with the
+* pattern for g_color.  Both versions use absolute
+* STA instructions whose address high bytes are
+* rewritten in place when the page has changed
+* since the last call.
+*
+********************************
+Clear
+
+         do    USE_FAST   ;*****
+* This performs a "visually linear" clear, erasing the screen
+* from left to right and top to bottom.  To reduce the amount
+* of code required we erase in thirds (top/middle/bottom).
+*
+* Compare to a "venetian blind" clear, which is what you get
+* if you erase memory linearly.
+*
+* The docs discuss different approaches.  This version
+* requires ((2 + 5*64 + 11) * 40 + 14) * 3 = 40002 cycles.
+* If we didn't divide it into thirds to keep the top-down
+* look, we'd need (5*64 + 9) * 120 = 39480 cycles, so
+* we're spending 522 cycles to avoid the venetian look.
+         lda   :clrloop+2 ;hi byte of first STA operand
+         cmp   g_page
+         beq   :pageok
+
+* We're on the wrong hi-res page.  Flip to the other one.
+* 4 + (20*64) = 1284 cycles to do the flip (+ a few more
+* because we're probably crossing a page boundary).
+* Y steps 192,189,...,3 so that :clrloop-1+Y lands on
+* the address hi byte of each of the 64 STAs.
+         BEEP
+         ldy   #NUM_ROWS  ;2
+]loop    lda   :clrloop-3+2,y ;4
+         eor   #$60       ;2
+         sta   :clrloop-3+2,y ;5
+         dey              ;2
+         dey              ;2
+         dey              ;2
+         bne   ]loop      ;3
+
+:pageok  ldx   g_color    ;grab the current color
+         lda   xormask,x
+         sta   :_xormsk+1
+         lda   evencolor,x
+
+* A holds the color pattern.  It is flipped 40 times
+* per third, so it is back to the even pattern when
+* each :clearthird call returns.
+         ldy   #0
+         jsr   :clearthird
+         ldy   #BYTES_PER_ROW
+         jsr   :clearthird
+         ldy   #BYTES_PER_ROW*2
+* fall through into :clearthird for final pass
+
+* Clear 40 byte columns on 64 rows, starting at
+* byte offset Y within each 128-byte group.
+:clearthird
+         ldx   #BYTES_PER_ROW-1 ;2
+:clrloop sta   $2000,y    ;5 (* 64)
+         sta   $2400,y    ;this could probably be
+         sta   $2800,y    ; done with LUP math
+         sta   $2c00,y
+         sta   $3000,y
+         sta   $3400,y
+         sta   $3800,y
+         sta   $3c00,y
+         sta   $2080,y
+         sta   $2480,y
+         sta   $2880,y
+         sta   $2c80,y
+         sta   $3080,y
+         sta   $3480,y
+         sta   $3880,y
+         sta   $3c80,y
+         sta   $2100,y
+         sta   $2500,y
+         sta   $2900,y
+         sta   $2d00,y
+         sta   $3100,y
+         sta   $3500,y
+         sta   $3900,y
+         sta   $3d00,y
+         sta   $2180,y
+         sta   $2580,y
+         sta   $2980,y
+         sta   $2d80,y
+         sta   $3180,y
+         sta   $3580,y
+         sta   $3980,y
+         sta   $3d80,y
+         sta   $2200,y
+         sta   $2600,y
+         sta   $2a00,y
+         sta   $2e00,y
+         sta   $3200,y
+         sta   $3600,y
+         sta   $3a00,y
+         sta   $3e00,y
+         sta   $2280,y
+         sta   $2680,y
+         sta   $2a80,y
+         sta   $2e80,y
+         sta   $3280,y
+         sta   $3680,y
+         sta   $3a80,y
+         sta   $3e80,y
+         sta   $2300,y
+         sta   $2700,y
+         sta   $2b00,y
+         sta   $2f00,y
+         sta   $3300,y
+         sta   $3700,y
+         sta   $3b00,y
+         sta   $3f00,y
+         sta   $2380,y
+         sta   $2780,y
+         sta   $2b80,y
+         sta   $2f80,y
+         sta   $3380,y
+         sta   $3780,y
+         sta   $3b80,y
+         sta   $3f80,y
+:_xormsk eor   #$00       ;2 flip odd/even bits
+         iny              ;2
+         dex              ;2
+         bmi   :done      ;2
+         jmp   :clrloop   ;3
+:done    rts
+
+         else             ;***** not USE_FAST
+
+* This version was suggested by Marcus Heuser on
+* comp.sys.apple2.programmer.  It does a "venetian blind"
+* clear, and takes (5 * 32 + 7) * 248 = 41416 cycles.
+* It overwrites half of the screen holes.
+         lda   :clrloop+5 ;hi byte of second STA operand
+         cmp   g_page
+         beq   :pageok
+
+* We're on the wrong hi-res page.  Flip to the other one.
+* 12 + (20*31) = 632 cycles to do the flip.  We have to
+* single out the first entry because it's $1f not $20.
+         BEEP
+         lda   :clrloop+2 ;4
+         eor   #$20       ;2 $1f <-> $3f
+         sta   :clrloop+2 ;4
+         ldy   #31*3      ;2
+]loop    lda   :clrloop+2,y ;4
+         eor   #$60       ;2 $20 <-> $40
+         sta   :clrloop+2,y ;5
+         dey              ;2
+         dey              ;2
+         dey              ;2
+         bne   ]loop      ;3
+
+:pageok  ldx   g_color
+         lda   xormask,x
+         sta   :_xormsk+1
+         lda   oddcolor,x
+         ldy   #248       ;120 + 8 + 120
+:clrloop
+]addr    =     $1fff
+         lup   32         ;begin a loop in assembler
+         sta   ]addr,y    ;5
+]addr    =     ]addr+$100 ;sta 20ff,21ff,...
+         --^
+:_xormsk eor   #$00       ;2
+         dey              ;2
+         bne   :clrloop   ;3
+         rts
+
+         fin              ;***** not USE_FAST
+
+
+********************************
+*
+* Draw rectangle outline.
+*
+* Inputs: in_x0l/h, in_y0, in_x1l/h, in_y1.  The
+* code treats in_y0 as the top row and in_x0 as the
+* left edge.
+*
+********************************
+DrawRect
+* We could just issue 4 line draw calls here, maybe
+* adjusting the vertical lines by 1 pixel up/down to
+* avoid overdraw.  But if the user wanted 4 lines,
+* they could just draw 4 lines.  Instead, we're going
+* to draw a double line on each edge to ensure that
+* the outline rectangle always has the correct color.
+*
+* Rather than draw two vertical lines, we draw a
+* two-pixel-wide filled rectangle on each side.
+*
+* We don't want to double-up if the rect is only one
+* pixel wide, so we have to check for that.
+*
+* If the rect is one pixel high, it's just a line.
+* If it's two pixels high, we don't need to draw
+* the left/right edges, just the top/bottom lines.
+* If it's more than two tall, we don't need to draw
+* the left/right edges on the top and bottom lines,
+* so we save a few cycles by skipping those.
+
+         lda   in_y1      ;copy top/bottom to local
+         sta   rast_bottom
+         dec   rast_bottom ;move up one
+         sec
+         sbc   in_y0      ;A = y1 - y0
+         beq   :isline    ;1 pixel high, just draw line
+         cmp   #1
+         beq   :twolines  ;2 pixels high, lines only
+         ldy   in_y0
+         iny              ;start down a line
+         sty   rast_top
+
+         lda   in_x0h     ;check to see if left/right
+         cmp   in_x1h     ; coords are the same; if
+         bne   :notline   ; so, going +1/-1 at edge
+         lda   in_x0l     ; will overdraw.
+         cmp   in_x1l
+         bne   :notlin1   ;(A already holds in_x0l)
+
+:isline  jmp   DrawLine   ;just treat like line
+
+* Set up left edge.  Top line is in Y.
+* The edge spans x0 through x0+1.
+:notline lda   in_x0l
+:notlin1 sta   rastx0l,y
+         clc
+         adc   #1
+         sta   rastx1l,y
+         lda   in_x0h
+         ora   #$80       ;"repeat" flag
+         sta   rastx0h,y
+         and   #$7f
+         adc   #0         ;carry from the lo-byte add
+         sta   rastx1h,y
+         jsr   FillRaster
+
+* The right edge spans x1-1 through x1.
+         ldy   rast_top
+         lda   in_x1l     ;now set up right edge
+         sta   rastx1l,y
+         sec
+         sbc   #1
+         sta   rastx0l,y
+         lda   in_x1h
+         sta   rastx1h,y
+         sbc   #0         ;borrow from the lo-byte sub
+         ora   #$80       ;"repeat" flag
+         sta   rastx0h,y
+         jsr   FillRaster
+
+* Now the top/bottom lines.
+:twolines
+         ldy   in_y0
+         jsr   :drawline
+         ldy   in_y1
+* fall through into :drawline for the bottom line
+
+* Fill the single row in Y from x0 to x1.
+:drawline
+         sty   rast_top
+         sty   rast_bottom
+         lda   in_x0l     ;copy left/right to the
+         sta   rastx0l,y  ; table entry for the
+         lda   in_x0h     ; appropriate line
+         sta   rastx0h,y
+         lda   in_x1l
+         sta   rastx1l,y
+         lda   in_x1h
+         sta   rastx1h,y
+         jmp   FillRaster
+
+
+********************************
+*
+* Draw filled rectangle.
+*
+********************************
+FillRect
+* Describe the whole rectangle with one raster-table entry
+* on the top row, flagged "repeat" so that FillRaster
+* reuses it for every row down to the bottom.
+* Inputs: in_y0=top, in_y1=bottom, in_x0=left, in_x1=right.
+         ldx   in_y1      ;last row to fill (inclusive)
+         stx   rast_bottom
+         ldy   in_y0      ;first row; doubles as table index
+         sty   rast_top
+
+         lda   in_x1h     ;right edge is stored as-is
+         sta   rastx1h,y
+         lda   in_x1l
+         sta   rastx1l,y
+
+         lda   in_x0l     ;left edge...
+         sta   rastx0l,y
+         lda   in_x0h
+         ora   #$80       ;...carries the "repeat" flag
+         sta   rastx0h,y
+
+         jmp   FillRaster ;tail call; it reloads A/X/Y
+
+
+********************************
+*
+* Fill an area defined by the raster tables.
+*
+********************************
+FillRaster
+
+* Render rasterized output.  The left and right edges
+* are stored in the rastx0/rastx1 tables, and the top
+* and bottom-most pixels are in rast_top/rast_bottom.
+*
+* This can be used to render an arbitrary convex
+* polygon after it has been rasterized.
+*
+* If the high bit of the high byte of X0 is set, we
+* go into "repeat" mode, where we just repeat the
+* previous line.  This saves about 40 cycles of
+* overhead per line when drawing rectangles, plus
+* what we would have to spend to populate multiple
+* lines of the raster table.  It only increases the
+* general per-line cost by 3 cycles.
+*
+* We could use the "repeat" flag to use this code to
+* draw vertical lines, though that's mostly of value
+* to an external caller who knows ahead of time that
+* the line is vertical.  The DrawLine code is pretty
+* good with vertical lines, and adding additional
+* setup time to every vertical-dominant line to
+* decide if it should call here seems like a
+* losing proposition.
+*
+* Inputs: rast_top / rast_bottom (first and last line,
+* both are drawn), the rastx0/rastx1 entries for those
+* lines, g_color and the colorline table.  A, X and Y
+* are all overwritten.
+
+]hbasl   equ   zptr0      ;hi-res base of current line, low
+]hbash   equ   zptr0+1    ; and high byte
+]lftbyte equ   zloc0      ;byte column of left edge (x/7)
+]lftbit  equ   zloc1      ;bit within that byte (x mod 7)
+]rgtbyte equ   zloc2      ;byte column of right edge
+]rgtbit  equ   zloc3      ;bit within that byte
+]line    equ   zloc4      ;not used within FillRaster
+]andmask equ   zloc5      ;pixel mask for a partial byte
+]cur_line equ  zloc6      ;line being drawn (Y gets reused)
+]repting equ   zloc7      ;nonzero once repeat mode is active
+
+         ldx   g_color    ;configure color XOR byte
+         lda   xormask,x  ;$00 or $7f for this color
+         do    USE_FAST   ;*****
+         cmp   rast_unroll+3 ;already configured?
+         beq   :goodmask  ;first EOR operand matches, skip
+         jsr   fixrastxor ;patch every EOR operand
+:goodmask
+         else
+         sta   _xorcolor+1 ;patch the EOR operand in the loop
+         fin              ;*****
+
+         lda   #$00       ;not in repeat mode yet
+         sta   ]repting
+
+         ldy   rast_top
+
+* Main rasterization loop.  Y holds the line number.
+rastloop
+         sty   ]cur_line  ;3
+         ldx   ylooklo,y  ;4
+         stx   ]hbasl     ;3
+         lda   ylookhi,y  ;4
+_pg_or1  ora   #$20       ;2 will be $20 or $40
+         sta   ]hbash     ;3 = 19 cycles
+         do    USE_FAST-1 ;***** i.e. not USE_FAST
+         stx   _wrhires+1 ;slow loop stores through an
+         sta   _wrhires+2 ; absolute address patched here
+         fin              ;*****
+
+* divide left edge by 7
+* In repeat mode the flag is copied to the next line's
+* entry, and after the first flagged line the byte/bit
+* values already in ]lft*/]rgt* are reused unchanged.
+         ldx   rastx0l,y  ;4 line num in Y
+         lda   rastx0h,y  ;4
+         bpl   :noflag    ;2
+         sta   rastx0h+1,y ;4 propagate
+         lda   ]repting   ;3 first time through?
+         beq   :firstre   ;2 yup, finish calculations
+         lda   ]rgtbyte   ;3 need this in A
+         bpl   :repeat    ;3 always
+:firstre lda   rastx0h,y  ;reload
+         sta   ]repting   ;any nonzero will do
+         and   #$7f       ;strip repeat flag
+:noflag  beq   :lotabl    ;high byte zero: X0 < 256
+         lda   mod7hi,x
+         sta   ]lftbit
+         lda   div7hi,x
+         sta   ]lftbyte
+         bpl   :gotlft    ;always
+         BREAK            ;debug
+:lotabl  lda   mod7lo,x
+         sta   ]lftbit
+         lda   div7lo,x
+         sta   ]lftbyte
+:gotlft
+
+* divide right edge by 7
+         ldx   rastx1l,y  ;4 line num in Y
+         lda   rastx1h,y  ;4
+         beq   :lotabr    ;3
+         lda   mod7hi,x
+         sta   ]rgtbit
+         lda   div7hi,x
+         sta   ]rgtbyte
+         bpl   :gotrgt    ;always
+         BREAK            ;debug
+:lotabr  lda   mod7lo,x   ;4
+         sta   ]rgtbit    ;3
+         lda   div7lo,x   ;4
+         sta   ]rgtbyte   ;3 = 25 for X1 < 256
+:gotrgt
+
+* Every path that reaches :repeat has ]rgtbyte in A.
+:repeat
+         cmp   ]lftbyte   ;3
+         bne   :not1byte  ;3
+
+* The left and right edges are in the same byte.  We
+* need to set up the mask differently, so we deal with
+* it as a special case.
+         ldy   ]lftbit
+         lda   leftmask,y ;create the AND mask
+         ldx   ]rgtbit
+         and   rightmask,x ;strip out bits on right
+         sta   ]andmask
+
+* Masked write: screen EOR ((color EOR screen) AND mask)
+* takes color bits where the mask is 1 and leaves the
+* screen bits alone where it is 0.
+         ldy   ]lftbyte
+         lda   colorline,y ;get color bits
+         eor   (]hbasl),y ;combine w/screen
+         and   ]andmask   ;remove not-ours
+         eor   (]hbasl),y ;combine again
+         sta   (]hbasl),y
+         jmp   rastlinedone
+
+* This is the more general case.  We special-case the
+* left and right edges, then byte-stomp the middle.
+* On entry, ]rgtbyte is in A
+:not1byte
+         sec              ;2 compute number of full
+         sbc   ]lftbyte   ;3  and partial bytes to
+         tax              ;2  draw
+         inx              ;2
+
+         ldy   ]rgtbit    ;3
+         cpy   #6         ;2 bit 6 = whole byte covered
+         beq   :rgtnospcl ;3
+         lda   rightmask,y ;handle partial-byte right
+         sta   ]andmask
+         ldy   ]rgtbyte
+         lda   colorline,y
+         eor   (]hbasl),y ;same masked write as above
+         and   ]andmask
+         eor   (]hbasl),y
+         sta   (]hbasl),y
+         dex              ;adjust count
+:rgtnospcl
+
+         ldy   ]lftbit    ;3 check left for partial
+         beq   :lftnospcl ;3
+         lda   leftmask,y ;handle partial-byte left
+         sta   ]andmask
+         ldy   ]lftbyte
+         lda   colorline,y
+         eor   (]hbasl),y ;same masked write as above
+         and   ]andmask
+         eor   (]hbasl),y
+         sta   (]hbasl),y
+         dex              ;adjust count
+         beq   rastlinedone ;bail if all done
+         iny              ;advance start position
+         bne   :liny      ;always
+         BREAK
+:lftnospcl
+
+         ldy   ]lftbyte   ;3
+:liny
+
+* Here X = number of whole bytes left to write (at
+* least 1) and Y = byte column of the first one.
+         do    USE_FAST   ;***** "fast" loop
+* Instead of looping, jump into an unrolled loop.
+* Cost is 10 cycles per byte with an extra 14 cycles
+* of overhead, so we start to win at 4 bytes.
+         lda   rastunidx,x ;4
+         sta   :_rastun+1 ;4 patch low byte of JMP target
+         lda   colorline,y ;4 get odd/even color val
+:_rastun jmp   rast_unroll ;3
+
+         else             ;***** "slow" loop
+* Inner loop of the renderer.  This runs 0-40x.
+* Cost is 14 cycles/byte.
+         lda   colorline,y ;get appropriate odd/even val
+_wrhires sta   $2000,y    ;5 replaced with line addr
+_xorcolor eor  #$00       ;2 replaced with $00/$7f
+         iny              ;2
+         dex              ;2
+         bne   _wrhires   ;3
+
+         fin              ;*****
+
+* The unrolled loop in the tables file jumps back here.
+rastlinedone
+         ldy   ]cur_line  ;3 more lines to go?
+         cpy   rast_bottom ;4
+         bge   :done      ;2
+         iny              ;2
+         jmp   rastloop   ;3 must have line in Y
+
+:done    rts
+
+fixrastxor
+         do    USE_FAST   ;*****
+* Update the EOR statements in the unrolled rastfill code.
+* Doing this with a loop takes ~600 cycles, doing it with
+* unrolled stores takes 160.  We only do this when we
+* need to, so changing the color from green to blue won't
+* cause this to run.
+*
+* Call with the XOR value in A.  Only STA is used, so
+* A, X, Y and the flags are left as they were.
+*
+* Each unrolled step is 5 bytes (STA (zp),Y / EOR #imm /
+* INY), so the EOR operand of step N sits at
+* rast_unroll+3+5*N.
+*
+* (A debug BEEP used to sit in front of the RTS.  It was
+* removed: it would fire on every color-class change and
+* dwarf the cycle cost quoted above.)
+]offset  =     0
+         lup   BYTES_PER_ROW
+         sta   rast_unroll+3+]offset
+]offset  =     ]offset+5
+         --^
+         rts
+         fin              ;*****
+
+
+* include the line functions
+         put   FDRAW.LINE
+
+* include the circle functions
+         put   FDRAW.CIRCLE
+
+* Listing is switched on just around each marker below,
+* presumably so that its address shows up in the
+* assembler output.
+         lst   on
+CODE_END equ   *          ;end of code section
+         lst   off
+
+* include the data tables
+         put   FDRAW.TABLES
+
+         lst   on
+DAT_END  equ   *          ;end of data / BSS
+         lst   off
+
+* Save the appropriate object file.  USE_FAST is the
+* same switch that selects the unrolled fill loop.
+         do    USE_FAST
+         sav   FDRAW.FAST
+         else
+         sav   FDRAW.SMALL
+         fin
--- a/FDRAW.TABLES.S
+++ b/FDRAW.TABLES.S
@ -0,0 +1,339 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Pre-computed data and        *
+* large internal buffers.      *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* Expected layout with alignment:
+*
+* P1 ylooklo, misc tables
+* P2 ylookhi, colorline
+* P3 rastx0l
+* P4 rastx0h
+* P5 rastx1l
+* P6 rastx1h, div7hi, mod7hi
+* P7 div7lo
+* P8 mod7lo
+* P9 rast_unroll, rastunidx
+*
+* Tables should be just under $900 bytes.
+
+         PG_ALIGN
+
+* Hi-res Y lookup, low part (192 bytes).
+ylooklo  HEX   0000000000000000
+         HEX   8080808080808080
+         HEX   0000000000000000
+         HEX   8080808080808080
+         HEX   0000000000000000
+         HEX   8080808080808080
+         HEX   0000000000000000
+         HEX   8080808080808080
+         HEX   2828282828282828
+         HEX   a8a8a8a8a8a8a8a8
+         HEX   2828282828282828
+         HEX   a8a8a8a8a8a8a8a8
+         HEX   2828282828282828
+         HEX   a8a8a8a8a8a8a8a8
+         HEX   2828282828282828
+         HEX   a8a8a8a8a8a8a8a8
+         HEX   5050505050505050
+         HEX   d0d0d0d0d0d0d0d0
+         HEX   5050505050505050
+         HEX   d0d0d0d0d0d0d0d0
+         HEX   5050505050505050
+         HEX   d0d0d0d0d0d0d0d0
+         HEX   5050505050505050
+         HEX   d0d0d0d0d0d0d0d0
+
+* Color masks for odd/even bytes, colors 0-7.
+evencolor dfb  $00,$2a,$55,$7f,$80,$aa,$d5,$ff
+oddcolor dfb   $00,$55,$2a,$7f,$80,$d5,$aa,$ff
+
+* XOR mask for colors 0-7 - non-BW flip on odd/even.
+xormask  dfb   $00,$7f,$7f,$00,$00,$7f,$7f,$00
+
+* AND mask for the 7 pixel positions, high bit set
+* for the color shift.
+andmask  dfb   $81,$82,$84,$88,$90,$a0,$c0
+
+* These are pixel AND masks, used with the modulo 7
+* result.  Entry #2 in leftmask means we're touching
+* the rightmost 5 pixels, and entry #2 in rightmask
+* means we're touching the 3 leftmost pixels.
+*
+* The high bit is always set, because we want to
+* keep the color's high bit.
+leftmask dfb   $ff,$fe,$fc,$f8,$f0,$e0,$c0
+rightmask dfb  $81,$83,$87,$8f,$9f,$bf,$ff
+
+         PG_ALIGN
+
+* Hi-res Y lookup, high part (192 bytes).
+* OR with $20 or $40.
+ylookhi  HEX   0004080c1014181c
+         HEX   0004080c1014181c
+         HEX   0105090d1115191d
+         HEX   0105090d1115191d
+         HEX   02060a0e12161a1e
+         HEX   02060a0e12161a1e
+         HEX   03070b0f13171b1f
+         HEX   03070b0f13171b1f
+         HEX   0004080c1014181c
+         HEX   0004080c1014181c
+         HEX   0105090d1115191d
+         HEX   0105090d1115191d
+         HEX   02060a0e12161a1e
+         HEX   02060a0e12161a1e
+         HEX   03070b0f13171b1f
+         HEX   03070b0f13171b1f
+         HEX   0004080c1014181c
+         HEX   0004080c1014181c
+         HEX   0105090d1115191d
+         HEX   0105090d1115191d
+         HEX   02060a0e12161a1e
+         HEX   02060a0e12161a1e
+         HEX   03070b0f13171b1f
+         HEX   03070b0f13171b1f
+
+* Masks for current color (even/odd), e.g. 55 2a 55 2a ...
+* Updated whenever the color changes.
+colorline ds   40
+
+         PG_ALIGN
+rastx0l  ds    NUM_ROWS
+         PG_ALIGN
+rastx0h  ds    NUM_ROWS
+         ds    1          ;repeat mode can overstep
+         PG_ALIGN
+rastx1l  ds    NUM_ROWS
+         PG_ALIGN
+rastx1h  ds    NUM_ROWS
+
+* Lookup tables for dividing 0-279 by 7.  The "hi"
+* parts (X >= 256) are 24 bytes each, so they fit in
+* what is left of the page after the 192-byte rastx1h
+* table.  The "lo" parts (X < 256) each fill a page.
+div7hi   HEX   2424242525252525
+         HEX   2525262626262626
+         HEX   2627272727272727
+mod7hi   HEX   0405060001020304
+         HEX   0506000102030405
+         HEX   0600010203040506
+
+         PG_ALIGN
+
+div7lo   HEX   0000000000000001
+         HEX   0101010101010202
+         HEX   0202020202030303
+         HEX   0303030304040404
+         HEX   0404040505050505
+         HEX   0505060606060606
+         HEX   0607070707070707
+         HEX   0808080808080809
+         HEX   0909090909090a0a
+         HEX   0a0a0a0a0a0b0b0b
+         HEX   0b0b0b0b0c0c0c0c
+         HEX   0c0c0c0d0d0d0d0d
+         HEX   0d0d0e0e0e0e0e0e
+         HEX   0e0f0f0f0f0f0f0f
+         HEX   1010101010101011
+         HEX   1111111111111212
+         HEX   1212121212131313
+         HEX   1313131314141414
+         HEX   1414141515151515
+         HEX   1515161616161616
+         HEX   1617171717171717
+         HEX   1818181818181819
+         HEX   1919191919191a1a
+         HEX   1a1a1a1a1a1b1b1b
+         HEX   1b1b1b1b1c1c1c1c
+         HEX   1c1c1c1d1d1d1d1d
+         HEX   1d1d1e1e1e1e1e1e
+         HEX   1e1f1f1f1f1f1f1f
+         HEX   2020202020202021
+         HEX   2121212121212222
+         HEX   2222222222232323
+         HEX   2323232324242424
+mod7lo   HEX   0001020304050600
+         HEX   0102030405060001
+         HEX   0203040506000102
+         HEX   0304050600010203
+         HEX   0405060001020304
+         HEX   0506000102030405
+         HEX   0600010203040506
+         HEX   0001020304050600
+         HEX   0102030405060001
+         HEX   0203040506000102
+         HEX   0304050600010203
+         HEX   0405060001020304
+         HEX   0506000102030405
+         HEX   0600010203040506
+         HEX   0001020304050600
+         HEX   0102030405060001
+         HEX   0203040506000102
+         HEX   0304050600010203
+         HEX   0405060001020304
+         HEX   0506000102030405
+         HEX   0600010203040506
+         HEX   0001020304050600
+         HEX   0102030405060001
+         HEX   0203040506000102
+         HEX   0304050600010203
+         HEX   0405060001020304
+         HEX   0506000102030405
+         HEX   0600010203040506
+         HEX   0001020304050600
+         HEX   0102030405060001
+         HEX   0203040506000102
+         HEX   0304050600010203
+
+
+* RastFill unrolled loop.  At each step we store the current
+* color value, XOR it to flip the bits if needed, and advance.
+* The caller needs to set the appropriate initial value based
+* on whether the address is odd or even.
+*
+* We can use a 3-cycle "EOR dp" or a 2-cycle "EOR imm".  The
+* former is one cycle slower, the latter requires us to
+* self-mod 40 instructions when the color changes.
+*
+* This must be page-aligned so that we can take the value
+* from the rastunidx table and self-mod a JMP without having
+* to do a 16-bit add.  We have just enough room for the
+* unrolled loop (40*5+3) and x5 table (41) = 244 bytes, fits
+* on a single page.
+
+         do    USE_FAST   ;*****
+         ds    \          ;pad to the next page boundary
+]hbasl   equ   zptr0      ;must match FillRaster
+rast_unroll equ *
+         lst   off
+* One 5-byte step per screen byte.  FillRaster jumps into
+* the middle with the color byte in A and the byte column
+* in Y; fixrastxor patches every EOR operand.
+         lup   BYTES_PER_ROW
+         sta   (]hbasl),y ;6
+         eor   #$00       ;2
+         iny              ;2  10 cycles, 5 bytes
+         --^
+         jmp   rastlinedone ;back into FillRaster
+
+* Index into rast_unroll.  If we need to output N bytes,
+* we want to jump to (rast_unroll + (40 - N) * 5) (where
+* 5 is the number of bytes per iteration).
+* Only the low byte is stored; that is enough because
+* rast_unroll starts on a page boundary.
+rastunidx
+]offset  =     BYTES_PER_ROW*5
+         lup   BYTES_PER_ROW+1 ;0-40
+         dfb   ]offset
+]offset  =     ]offset-5
+         --^
+
+         fin              ;*****
+
+
+********************************
+*
+* Code used to generate tables above.  If you want to
+* decrease load size, use these functions to generate
+* the data into empty memory, then discard the code.
+* (Maybe use a negative DS and overlap with rastx0l?)
+*
+********************************
+         DO    0          ;*****
+
+init_ylook
+]hbasl   equ   zptr1
+]hbash   equ   zptr1+1
+
+* Initialize Y-lookup table.  We just call the bascalc
+* function for every line, last line first.
+*
+* bascalc leaves its result in ]hbasl/]hbash, so those
+* are the locations read back here.  The high byte is
+* stored exactly as bascalc returns it ($00-$1f): the
+* ylookhi table is $0000-based, and FillRaster ORs in
+* $20 or $40 for the hi-res page at draw time.
+*
+* X counts the lines still to do; Y is the line number
+* and table index.  bascalc preserves both.
+         ldx   #NUM_ROWS
+         ldy   #NUM_ROWS-1
+]loop    tya
+         jsr   bascalc
+         lda   ]hbasl
+         sta   ylooklo,y
+         lda   ]hbash
+         sta   ylookhi,y
+         dey
+         dex
+         bne   ]loop
+         rts
+
+* Hi-res base address calculation.  This is based on the
+* HPOSN routine at $F411.
+*
+* Call with the line in A.  The results are placed into
+* zptr1.  X and Y are not disturbed.
+*
+* The value is in the $0000-1fff range, so you must OR
+* the desired hi-res page in.
+*
+bascalc
+* Bit-level view, writing the line number as abcdefgh
+* (a = bit 7):
+*   low byte  = e a b a b 0 0 0
+*   high byte = 0 0 0 f g h c d
+* A is overwritten; on return it holds the high byte.
+         pha
+         and   #$c0       ;ab000000
+         sta   ]hbasl
+         lsr
+         lsr              ;00ab0000
+         ora   ]hbasl     ;abab0000
+         sta   ]hbasl
+         pla
+         sta   ]hbash     ;abcdefgh
+         asl
+         asl
+         asl              ;A=defgh000, carry=c
+         rol   ]hbash     ;bcdefghc, carry=a
+         asl              ;carry=d
+         rol   ]hbash     ;cdefghcd
+         asl              ;carry=e
+         ror   ]hbasl     ;eabab000
+         lda   ]hbash
+         and   #$1f       ;000fghcd
+         sta   ]hbash
+         rts
+
+*
+* Create divide-by-7 tables.
+*
+mkdivtab
+]val     equ   zloc0
+
+* Fills div7lo/mod7lo for X = 0-255, then div7hi/mod7hi
+* for X = 256-279.  ]val is the running quotient, X the
+* running remainder (0-6), Y the table index.
+         ldy   #0
+         sty   ]val
+         ldx   #0
+]loop    lda   ]val
+         sta   div7lo,y
+         txa
+         sta   mod7lo,y
+         inx
+         iny
+         beq   :lodone    ;Y wrapped: 256 entries written
+         cpx   #7
+         bne   ]loop
+         inc   ]val       ;remainder rolled over
+         ldx   #0
+         beq   ]loop      ;always
+* Y wraps with X=4, so the skipped cpx #7 / ]val update
+* would not have fired anyway.  Quotient, remainder and
+* Y=0 carry straight over into the "hi" tables.
+:lodone                   ;safe to ignore ]val update
+]loop    lda   ]val
+         sta   div7hi,y
+         txa
+         sta   mod7hi,y
+         iny
+         cpy   #280-256   ;24 entries cover X=256-279
+         beq   :hidone
+         inx
+         cpx   #7
+         bne   ]loop
+         inc   ]val
+         ldx   #0
+         beq   ]loop      ;always
+:hidone  rts
+
+         FIN              ;*****
--- a/README.md
+++ b/README.md
@ -1,2 +1,59 @@
-# fdraw
-Fast Apple II graphics
+fdraw
+=====
+
+Fast graphics routines for the Apple II  
+By Andy McFadden  
+Version 0.3, August 2015
+
+## Overview ##
+
+The fdraw library provides fast rendering of points, lines, rectangles,
+and circles, as well as high-speed screen clears, for Apple II hi-res
+graphics.  It can be used from Applesoft or 6502 assembly language.
+
+Two disk images are available in the [fdraw-disks](fdraw-disks.zip) zip
+archive.  `fdrawdemo.do` is a 140K disk image with the demos that will
+run on an Apple ][+ or later.  `fdrawdev.po` is an 800K disk image with
+the source code, demos, and a few extras.
+
+A video of the demos running in the AppleWin emulator
+[is available](https://www.youtube.com/watch?v=z2RFGVoaROE).
+
+Learn more about how fdraw works in the
+[library documentation](docs/manual.md).
+
+Learn about the demos in the [demo documentation](docs/demos.md).
+
+Learn more about what possessed me to write a graphics library for the
+Apple II more than 20 years after the platform was discontinued in the
+[fadden's brain documentation](docs/personal-notes.md).
+
+The main bits of source code are accessible from git for easy viewing,
+but the "official" home is on `fdrawdev.po`.
+
+All code is copyright 2015 by Andy McFadden.  All rights reserved.  The
+source code is available under the Apache 2 license (a very friendly
+open-source license).
+
+
+### Version History ###
+
+##### v0.1 March 13, 2006
+
+No source code, just a demo with fast filled circles and screen clears.
+
+##### v0.2 March 20, 2006
+
+Polished up the sources and published.  This version implemented Clear,
+FillRect, FillCircle, and FillRaster.
+
+##### v0.3 August 21, 2015
+
+Added DrawPoint, DrawLine, DrawRect, DrawCircle, and SetLineMode.  Various
+size and performance improvements.
+
+Added Amperfdraw to make Applesoft BASIC programming easier.
+
+Added several more demos and tests.
+
+Added documentation.
--- a/docs/demos.md
+++ b/docs/demos.md
@ -0,0 +1,167 @@
+fdraw Demo README
+=================
+
+The fdraw distribution comes with a handful of demonstration programs.
+Most of them are written in Applesoft BASIC, and use the amperfdraw
+interface.  This is a somewhat poor way to demonstrate animation
+performance, as Applesoft adds a tremendous amount of overhead, but it
+is the only way to show what you *can* do with Applesoft.
+
+The easiest way to run them is with the "DEMO" program, which scans the
+DEMOS directory for BASIC programs and presents a list.  You can also
+just run them directly.
+
+* INTRO : Sort of a "hello, world" for fdraw.  Mix of single- and
+  double-buffered animation.
+
+* CIRCULAR : Draws lots of circles.
+
+* RECTSPLAT : Draws lots of rectangles.
+
+* CUBIC : Draws a spinning wireframe 3D cube.  (The 3D coordinates are
+  pre-computed -- fdraw doesn't do matrix transforms.)
+
+* TUNNEL : Animates circles to simulate driving through a tunnel.
+
+* LINEAR : Draws lots of lines.  The wipes show speed differences for
+  horizontal and vertical special cases, while the circular spinner
+  shows HPLOT is not as fast as &HPLOT which is not as fast as &PLOT for
+  a set of lines at a variety of angles.
+
+* LINE.DIFF : Draws several lines with the ROM routines and fdraw
+  side-by-side to illustrate the difference in line style.
+
+* CLEARLY : Clears the screen 32 times, 4 sets in each of the 8 colors.
+  The first round is done with the Applesoft ROM routine ("CALL 62454"),
+  the second round uses the fdraw &CLEAR function.
+
+* HRFAN : A simple line-art demo, using "xdraw" DrawLine with lines in
+  different colors.  Not a great demo, as the Applesoft code driving it
+  is rather slow, but it looks pretty good if you bump up the emulation
+  speed or switch to IIgs "fast" mode.  (This deserves a conversion to
+  assembly language.)
+
+* BRIAN.THEME.ORI : The Brian's Theme demo from the DOS 3.3 System
+  Master.  Unmodified except for integration with the demo menu
+  system, and with the bug on line 31112 fixed.
+
+* BRIAN.THEME.NEW : The Brian's Theme demo with '&' placed in front of
+  the various draw calls.  There isn't a huge difference in speed, as
+  there's a lot of overhead from Applesoft, but it's interesting to note
+  the change in the appearance of the lines.
+
+* WIGGLE : Sample program that shows direct use of rasterization tables.
+
+When the demos are launched from the menu, they will assume that fdraw
+is already loaded and won't try to load it again.  If you run the demo
+program directly, it will try to load FDRAW.FAST and AMPERFDRAW from the
+parent directory before doing any drawing.
+
+
+## Extras ##
+
+The EXTRAS directory has some additional software that isn't "officially"
+part of fdraw, but may be of use.
+
+NOTE: some of these assume fdraw and amperfdraw are already loaded, and
+will hang if not.  Run DEMO and hit <esc> before running these.
+
+* ARRAY.EXAMPLE : The &PLOT example from the documentation.
+
+* XDRAW.ANIM : A demonstration of line animation using "xdraw" mode and
+  a simple shape that is drawn twice by a single &PLOT call.  One copy
+  is offset by 2 pixels, so each &PLOT call erases the previous copy and
+  draws a new copy 2 pixels to the right.  The animation is shown twice,
+  once with "erase all, draw all", and once with the erase and draw calls
+  interleaved for every line.
+
+* LINEFONT : Program for creating draw-array tables for text phrases.  Used
+  to create data files for the "intro" demo.  See the "LINEFONT Details"
+  section for more information.
+
+* DAVIEWER : Views the contents of .DA files created by LINEFONT.
+
+* BENCHCLEAR : Calls the "clear" function 256 times from a small
+  assembly-language program.  Handy for benchmarks, but slightly silly
+  since it's relatively easy to calculate the exact cycle cost.
+
+
+## LINEFONT Details ##
+
+NOTE: this program is an unfinished rough cut ("pre alpha"), used for
+preparing data for demos.
+
+The program includes a font definition, routines for displaying
+characters, and code for generating and exporting pre-rendered strings.
+
+Character vertices are expressed as floating-point values.  The baseline
+is at zero, the peak ascent is at 1.0, the lowest descent is -1.0. The
+leftmost pixel is at zero, the maximum value for the rightmost pixel is 1.0.
+Characters don't have to fill out the entire cell -- proportionally-spaced
+fonts are supported -- but they are expected to start at the left edge.
+
+So a capital 'M' might look like this:
+
+  0.0,0.0 -> 0.0,1.0 -> 0.5,0.7 -> 1.0,1.0 -> 1.0,0.0
+
+There is currently no "user interface", unless the "user" can program in
+Applesoft BASIC.  To generate strings, add a series of statements that set
+variables and call 20000 to add rendered strings to the set.  The relevant
+variables are:
+
+  S$ - string to add
+  DW - desired width, in pixels, of a cell 1.0 units wide
+  DH - desired height, in pixels, of a cell 2.0 units high (ascent + descent)
+  IS% - inter-character spacing, in pixels
+  SW% - width of the space character (usually same as DW)
+  MO% - monospace flag; if nonzero, all chars are treated as 1.0 units wide
+
+Remove the REM from the start of line 1010 to enable the character viewer.
+At present only a couple of lower-case letters are defined.
+
+
+#### LINEFONT Output ####
+
+The LINEFONT program outputs a binary blob that can be passed to
+the &PLOT array-draw function.  The file structure is:
+
+0  byte - number of array sets in the list.
+1  2 bytes * N - table of offsets to individual array sets.  One of
+    these per array set.  The value is the offset from the start of the
+    file.
+
+(2N+1) array set #1:
+0  byte - number of vertices (0-127)
+1  byte - number of index pairs (0-127)
+2  2 bytes * V - vertices (values are signed X/Y)
+X  2 bytes * I - index pairs (values are 0-127)
+
+To display phrase #3, you would get the 16-bit value from the offset
+table with PEEK(start + 1 + 3 * 2) + PEEK(start + 2 + 3 * 2) * 256.
+You get the number of vertices from PEEK(start + offset), and the number
+of index pairs from PEEK(start + offset + 1).  Finally, call the array-draw
+function with:
+
+ VA = start + offset + 2
+ IA = VA + num_vertices * 2
+ &PLOT va, ia, num_index_pairs
+
+The 0,0 point in the blob is in the center of the phrase horizontally
+(which allows a maximum width of 255 pixels), and at the font baseline
+vertically (so most of the font will appear above the zero point, but
+descenders will extend below).
+
+
+#### Future Enhancements ####
+
+Right now the font definition is embedded in the program.  This takes up
+a lot of space -- before too long the BASIC program is going to intrude
+on the hi-res page -- and is unnecessarily restrictive.  The font should be
+defined by a separate program, and BSAVEd into a line-font file that
+LINEFONT can load.
+
+Generating strings should be menu-driven and interactive, rather than
+requiring manual changes to the code to fiddle with sizes and spacing.
+DAVIEWER should be folded into the generation program (though it's kind
+of handy as a simple example of how to unpack and access content).
+
--- a/docs/manual.md
+++ b/docs/manual.md
@ -0,0 +1,990 @@
+fdraw Library Documentation
+===========================
+
+Fast graphics primitives for the Apple II  
+By Andy McFadden  
+Version 0.3, August 2015
+
+## Overview ##
+
+The fdraw library provides fast rendering of points, lines, rectangles,
+and circles, as well as high-speed screen clears, for Apple II hi-res
+graphics.  It can be used from Applesoft or assembly language.
+
+The Applesoft ROM routines were designed to be as compact as possible,
+and were unable to use self-modifying code techniques, so their speed is
+less than what the Apple II is capable of.  The fdraw routines pick a
+different point in the speed/space trade-off continuum, providing fast
+speeds at a reasonable size.  Not everyone agrees on what "reasonable"
+means, so the fdraw code can be built in two modes, one that favors
+speed, one that reduces size.
+
+**Contents:**
+
+- [Applesoft BASIC Ampersand API](#amperapi)
+- [Raw API](#rawapi)
+- [Building the Code](#building)
+- [Apple II Hi-res in a Nutshell](#nutshell)
+- [Notes on the Drawing Functions](#notes)
+- [General Notes](#additional-notes)
+- [Enhancement Ideas](#ideas)
+- [My Quest for Lines](#history)
+
+
+<div id='amperapi'/>
+## Applesoft BASIC Ampersand API (Amperfdraw) ##
+
+The ampersand API acts as a bridge between Applesoft BASIC and fdraw.
+It's more convenient and has less overhead than POKE and CALL, though
+you are not prevented from using that approach if you prefer.  It's
+best to use one or the other though, not mix and match.
+
+All arguments are checked for validity.  An appropriate Applesoft
+error is thrown if invalid syntax or arguments are discovered.
+
+This is not intended to be compatible with, nor a replacement for, the
+ampersand utilities in Beagle Graphics.
+
+* &NEW - calls the fdraw Init function (which sets the color to 0 and
+  selects hi-res page 1).  You must do this once, at the start of
+  your program, after fdraw has been loaded.  This also resets internal
+  amperfdraw state, setting the "HPLOT TO" origin to (0,0) and the "AT"
+  point to (139,95).
+* &HGR - does what HGR does, only faster. Equivalent to executing
+  `&HCOLOR=0:&SCRN(1):&CLEAR:&HCOLOR=[prevcolor]`, and then setting the
+  display softswitches to display hi-res page 1 in mixed mode.  Also sets
+  $e6 (HPAG) for convenience in case you want to mix & match with ROM
+  routines.
+* &HGR2 - like &HGR, but for page 2.  Like HGR2, this turns off
+  mixed-text mode.
+* &SCRN({1,2}) - sets the hi-res page that will be used for drawing.  Does
+  not change which page is displayed.  (Use the softswitches, or call
+  &INVERSE.)
+* &INVERSE - flips the render page to the other page, and hits the
+  display softswitches to show the page that was just rendered.  Intended
+  for double-buffered animation.
+* &HCOLOR={0-7} - sets color, using the same numbering scheme as Applesoft.
+  Does not affect the color used by the ROM routines.
+* &CLEAR - clears screen to current color.
+* &HPLOT [TO] x,y [TO x,y ...] - draws a point or a line.  Works the same as
+  Applesoft, e.g. "&HPLOT TO" starts from the end of the previously
+  drawn line, and you can chain multiple "TO x,y" in a single statement.
+* &EXP {0,1} - set line mode.  0 is normal, 1 is "xdraw".
+* &XDRAW left,top,right,bottom - draws outline rectangle.
+* &DRAW left,top,right,bottom - draws filled rectangle.
+* &COS cx,cy,r - draws outline circle.
+* &SIN cx,cy,r - draws filled circle.
+
+* &AT cx,cy - sets center offset for array-based rendering.  Position must
+  be on the hi-res screen (0-279, 0-191).
+* &PLOT vertexAddr, indexAddr, indexCount [AT cx,cy] - draws from the
+  specified byte-arrays.  See the "Drawing Lines with Indexed Byte-Arrays"
+  section for the full explanation.
+
+
+<div id='rawapi'/>
+## Raw API ##
+
+The code is assembled at $6000 by default.  The program's length includes
+all data tables and work areas, and no memory outside of the program,
+zero page, and the current hi-res page is modified.
+
+Input parameters and the function jump table are located near the start
+of the program.  The API description below describes the addresses in
+relative terms.
+
+Input parameters are not checked for validity.  They must be in the range
+specified by the API, or undefined (but probably bad) behavior will result.
+The values will not be modified by fdraw functions.
+
+All drawing operations use the current color.
+
+* +0   Init - call this when the library is first loaded.  It must be
+       called before any other functions are used.  It initializes the
+       color to zero and the page to $20.
+* +3   (major version number, currently 0)
+* +4   (minor version number, currently 3)
+* +5   Input parameter area:
+  *  +5   arg - used for misc functions, e.g. SetColor and SetPage
+  *  +6   x0l - low part of the X0 coordinate (0-279)
+  *  +7   x0h -   high part of X0
+  *  +8   y0  - Y0 coordinate (0-191)
+  *  +9   x1l - low part of X1 (0-279)
+  *  +10  x1h -   high part of X1
+  *  +11  y1  - Y1 coordinate (0-191)
+  *  +12  rad - circle radius (0-255)
+* +13  (reserved)
+* +16  SetColor - set the color used for drawing (0-7) to the value in "arg".
+       The numbering is the same as the Applesoft hi-res colors.
+* +19  SetPage - set the hi-res page used for drawing to the value in "arg",
+       which must be $20 or $40.  Does not change the page that is displayed.
+       (Because a bad value can cause memory corruption, this value *is*
+       checked, and bad values rejected.)
+* +22  Clear - erase the current hi-res page to the current color.
+* +25  DrawPoint - plot a single point at x0,y0.
+* +28  DrawLine - draw a line from x0,y0 to x1,y1 (inclusive).
+* +31  DrawRect - draw a rectangle with corners at x0,y0 and x1,y1 (inclusive).
+       x0,y0 is the top-left, x1,y1 is the bottom-right.  The left and
+       right edges will be drawn two bits wide to ensure that the edges
+       are visible (drawn at x0+1, x1-1).
+* +34  FillRect - draw a filled rectangle with corners at x0,y0 and x1,y1
+       (inclusive).
+* +37  DrawCircle - draw a circle with center at x0,y0 and radius=rad.
+* +40  FillCircle - draw a filled circle with center at x0,y0 and radius=rad.
+* +43  SetLineMode - set the DrawLine mode to the value in "arg", which can
+       be 0 (normal) or 1 (xdraw).
+* +46  (reserved)
+
+* +49  FillRaster - draw an arbitrary shape from the rasterization tables.
+       For each line from top to bottom, the left and right edges will
+       be read from rastx1/rastx2 and a raster drawn in the current color.
+* +52  (byte) topmost line to rasterize (0-191)
+* +53  (byte) bottom-most line to rasterize (0-191), inclusive
+* +54  (2 bytes) address of rastx1l table
+* +56  (2 bytes) address of rastx1h table
+* +58  (2 bytes) address of rastx2l table
+* +60  (2 bytes) address of rastx2h table
+
+The rasterization table addresses are read-only; changing them will have
+no effect.
+
+fdraw uses a fair number of zero page locations.  The exact set can be
+determined by looking at FDRAW.S.  The locations were chosen to not
+interfere with DOS, ProDOS, Applesoft, or the Monitor.  They may
+interfere with Integer BASIC, SWEET16, or your own application code.
+Remapping them to different locations is straightforward: just change
+the assignment of zptr/zloc values near the top of FDRAW.S to use
+different addresses.  fdraw does not expect any zero page value to be
+preserved across calls, so you're welcome to use those locations in your
+own code, but understand that fdraw functions will overwrite them.
+
+
+<div id='nutshell'/>
+## Apple II Hi-res in a Nutshell ##
+
+This is a quick overview of the Apple II hi-res graphics architecture
+for anyone not recently acquainted.
+
+The Apple II hi-res graphics screen is a quirky beast.  The typical
+API treats it as 280x192 with 6 colors (black, white, green, purple,
+orange, blue), though the reality is more complicated than that.
+
+There are two hi-res screens, occupying 8K each, at $2000 and $4000.
+You turn them on and flip between them by accessing softswitches in
+memory-mapped I/O space.
+
+Each byte determines the color of seven adjacent pixels, so it takes
+(280 / 7) = 40 bytes to store each line.  The lines are organized into
+groups of three (120 bytes), which are interleaved across thirds of
+the screen.  To speed the computation used to find the start of a
+line in memory, the group is padded out to 128 bytes; this means
+((192 / 3) * 8) = 512 of the 8192 bytes are part of invisible
+"screen holes".  The interleaving is responsible for the characteristic
+"venetian blind" effect when clearing the screen.
+
+Now imagine 280 bits in a row.  If two consecutive bits are on, you
+get white.  If they're both off, you get black.  If they alternate
+on and off, you get color.  The color depends on the position of the bit;
+for example, if even-numbered bits are on, you get purple, while
+odd-numbered bits yield green.  The high bit in each byte adjusts the
+position of bits within that byte by half a pixel, changing purple and
+green to blue and orange.
+
+This arrangement has some curious consequences.  If you have green and
+purple next to each other, there will be a color glitch where they meet.
+The reason is obvious if you look at the bit patterns when odd/even meet:
+`...010101101010...` or `...101010010101...`.  The first pattern has two
+adjacent 1 bits (white), the latter two adjacent 0 bits (black).  Things
+get even weirder if the split occurs at a byte boundary and the high bit is
+different, as the half-pixel shift can make the "glitch" pixel wider or
+narrower by half a pixel.
+
+The Applesoft ROM routines draw lines that are 1 bit wide.  If you execute
+a command like `HGR : HCOLOR=1 : HPLOT 0,0 to 0,10`, you won't see
+anything happen.  That's because HCOLOR=1 sets the color to green,
+which means it only draws on odd pixels, but the HPLOT command we gave
+drew a vertical line on even pixels.  It set 11 bits to zero, but since
+the screen was already zeroed out there was no apparent effect.
+
+If you execute `HGR : HCOLOR=3 : HPLOT 1,0 to 1,10`, you would expect a
+white line to appear.  However, drawing in "white" just means that no
+bit positions are excluded.  So it drew a vertical column of pixels at
+X=1, which appears as a green line.
+
+If (without clearing the screen after the previous command) you execute
+`HCOLOR=4 : HPLOT 5,0 to 5,10`, something curious happens: the green line
+turns orange.  HCOLOR=4 is black with the high-bit set.  So we drew a
+line of black in column 5 (which we won't see, because that part of the
+screen is already black), and set the high bit in that byte.  The same
+byte holds columns 0 through 6, so drawing in column 5 also affected
+column 1.  We can put it back to green with "HCOLOR=0 : HPLOT 5,0 to 5,10".
+
+It's important to keep the structure in mind while drawing to avoid
+surprises.
+
+Note that the Applesoft ROM routines treat 0,0 as the top-left corner,
+with positive coordinates moving right and down, and lines are drawn
+with inclusive end coordinates.  This is different from many modern
+systems.  fdraw follows the Applesoft conventions to avoid confusion.
+
+Handy table of graphics softswitches:
+
+name   | addr  | decimal | purpose
+------ | ----- | ------- | ------------------
+TXTCLR | $c050 | -16304  | enable graphics
+TXTSET | $c051 | -16303  | text-only
+MIXCLR | $c052 | -16302  | disable mixed mode
+MIXSET | $c053 | -16301  | enable mixed mode (4 lines of text)
+LOWSCR | $c054 | -16300  | display page 1
+HISCR  | $c055 | -16299  | display page 2
+LORES  | $c056 | -16298  | show lo-res screen
+HIRES  | $c057 | -16297  | show hi-res screen
+
+
+<div id='building'/>
+## Building the Code ##
+
+The main fdraw code is written for the Merlin assembler (specifically
+Merlin-16 3.40, though other versions should work).  It uses plain 6502
+code, and is expected to run on an Apple ][+.
+
+For convenience when editing the files on an Apple II, and to allow the
+code to be compiled by Merlin-16 running under ProDOS 8, the code is
+broken into four files.  The main file, FDRAW.S, includes the other
+three with PUT directives.  FDRAW.S holds the API entry points and some
+of the drawing code.  FDRAW.LINE.S has the code for drawing points and
+lines, while FDRAW.CIRCLE.S has the code for drawing circles.
+FDRAW.TABLE.S holds the data tables, as well as empty space for work
+areas.  The empty space is included in the binary so you can determine
+the full memory footprint by looking at the length of the file.
+
+Near the top of FDRAW.S is a constant, `USE_FAST`, which may be set
+to 0 or 1.  If set to 0, some code optimizations are disabled,
+reducing the size of the code and data areas.  Further, the page
+alignment on data tables is disabled, reducing the internal fragmentation
+of the data area.
+
+The USE_FAST setting also determines which file receives the assembler
+output: FDRAW.FAST or FDRAW.SMALL.  To generate both, it is necessary to
+assemble the file, change the constant, and then assemble the file again.
+
+Tests and demos are written in Applesoft BASIC, with a couple of
+exceptions.
+
+
+### Why So Big? ###
+
+The fdraw code weighs in at a hefty 5KB (or 4KB for the "small" build).
+That doesn't sound like much in the age of multi-gigabyte mobile phones,
+but it's a sizeable fraction of the space available on an Apple ][+.
+
+If you want to modify individual pixels quickly, you need two things:
+a line base-address table, and a divide-by-7 table.  Computing base
+addresses and dividing by 7 aren't hugely expensive, but we're going
+to be doing them often, so they need to be as fast as possible.
+
+The line address table has 192 entries, one for each line, 2 bytes per
+entry.  The divide-by-7 table has 280 entries, one for each horizontal
+pixel position, with one byte for the quotient and one for the remainder.
+(The remainder can be expressed as a numeric value from 0 to 6, or as
+a byte with a specific bit set.)
+
+That's 944 bytes.  For optimum performance, each table must fit on a
+single page of memory.  We can split the division table into two pieces,
+one for 0-255 and one for 256-279, and put the smaller half on the same
+page as the Y table, along with 16 bytes of padding.  The final size is
+256 + 256 + (192+24+24+pad) + 192 = 960.  So you can write off 1K of
+memory before you've written any code.
+
+(There's a clever way to reduce the size of the y-lookup table to 24
+entries, but it's slightly faster and much easier to use full tables.)
+
+For the FillRaster function, fdraw needs to record the left and right
+X coordinates on each line (2 bytes each), so that's 192 * 4 = 768 bytes.
+Again, for optimum performance, each table needs to be on its own page,
+so for USE_FAST=1 that expands to 1024 bytes.
+
+Add to that another full page of unrolled rasterization code, and you've
+got 2304 bytes of tables.
+
+The rest is code, most of which was written with a flagrant disregard
+for size.  Many common code fragments are repeated inline, rather than
+called as a subroutine, because a subroutine call (JSR+RTS) costs 12
+cycles.  Calling a common "plot a point" function from the line-drawing
+code would increase the per-pixel cost by 15-20%.
+
+
+<div id='notes'/>
+## Notes on the Drawing Functions ##
+
+### Screen Clear ###
+
+The Clear function erases the current hi-res page to the current color.
+It's several times faster than the version built into the ROM.
+
+#### Performance ####
+
+The fastest possible way to clear the screen to a specific color on a
+6502 is to write to every visible location with an absolute store
+instruction.  Subtracting the screen holes, that's 7680 addresses *
+4 cycles = 30720 cycles.  The code to do that would be 23,040 bytes long,
+making it impractical.
+
+A slower but more memory-efficient approach has one store statement for
+each line, and iterates through 40 times (280 / 7 = 40).  Factoring in the
+loop overhead, that comes out to 40 * (192 * 5 + 9) = 38760 cycles.
+192 sets of store instructions fills 576 bytes, which is much better
+than 23K, but still quite a lot.
+
+We can reduce the size further by taking the lines 3 at a time, erasing
+the first 120 bytes in each 128-byte group (the last 8 bytes are the
+screen hole).  We'd need to use 7680/120 = 64 store instructions, for a
+total of 120 * (64 * 5 + 9) = 39480 cycles, with 192 bytes for the main
+part of the erase loop.  We're not quite 2% slower, but 384 bytes
+smaller, which seems a fair trade-off.  Because we're accessing memory
+linearly we now have a "venetian blind" clear, which is something of an
+Apple II trademark, but we can fix that by spending an additional 522
+cycles to erase the screen in thirds (top/middle/bottom).
+
+Any further changes that make the code smaller also increase the execution
+time.  When built with USE_FAST=0, the code will use a different loop
+with 32 stores that write 248 bytes each, and takes 41416 cycles.  It's
+half the size, but nearly 2000 cycles slower, and overwrites half of the
+screen holes.
+
+At the extreme end of space over speed is the Applesoft ROM routine -- HGR
+or "CALL 62454" -- which only needs about 30 bytes for its main loop, but
+takes (8192*33)+(12*64)+17 = 271121 cycles for black or white, or
+(8192*40)+(12*64)+17 = 328465 cycles for green/purple/blue/orange --
+7-8x slower than our preferred implementation.
+
+The screen clear is wired to a specific hi-res page, so the SetPage
+function must rewrite the store instructions when the page changes (or
+we need to keep two full copies of the function around).  For an
+application that is constantly doing flip-erase, the overhead must be
+factored into the efficiency of the approach -- for example, rewriting
+stores with indexed LDA/EOR/STA in a loop will take 20 cycles per iteration,
+1280 cycles for the full set of 64.  The "slow" clear has half the
+number of store instructions, so takes half the time to fix up after
+a page flip.
+
+
+### Raster Fill ###
+
+Drawing an outline of a rectangle or circle can be done efficiently by
+drawing lines or plotting points.  Drawing a filled shape is more
+expensive if one point is plotted at a time, especially on the Apple II
+where every byte affects 7 pixels.
+
+For filled shapes, fdraw populates a rasterization table.  The table has
+192 entries, each of which holds the left and right edges of the shape
+on that line.  The code fills in the pixels one line at a time, using
+a simple byte store for the middle parts, and bit masks at the edges.
+
+External applications can use the raster renderer directly by filling
+out the rasterization table and calling FillRaster.
+
+While the FillRaster function itself will not modify the contents of the 
+raster tables, other fdraw calls will, sometimes unexpectedly.  For
+example, drawing a horizontal line is performed with a single-line
+fill call.  Filled rectangles might populate the table in the way you'd
+expect, or might use some internal shortcut that only fills out one line
+and sets a "repeat" flag.  Don't make assumptions about what will be in
+the table after a call to one of the drawing functions.  You *can* count
+on whatever you wrote there yourself to be unmodified after calls to
+FillRaster, SetColor, or SetPage, so you can do page-flipping and
+color-cycling without having to repopulate the tables.
+
+#### Performance ####
+
+The fill code needs about 100 cycles to set up each line when drawing
+a rectangle, more if the line doesn't start and end on byte boundaries.
+The inner loop costs 10 cycles per byte.  To clear the screen with the
+raster fill code, it would take (192 * (100 + 40 * 10)) = 96000 cycles,
+or nearly 2.5x the time required for the dedicated clear code.  Which is
+about what you'd expect, as the screen erase needs 4 cycles per byte, and
+has lower per-line overhead.  (This can be improved significantly; see
+the notes in the "enhancements" section.)
+
+Non-rectangular shapes take slightly longer to set up, as the edges must
+be recomputed for each line.
+
+
+### Lines ###
+
+The goal is to provide a replacement for Applesoft's HPLOT function
+that is faster and more consistent in appearance.  Lines are drawn using
+Bresenham's run-length algorithm.
+
+Internally, there are five separate functions.  Horizontal and vertical
+lines each get a special-case handler.  There's another for mostly-vertical
+lines, one for mostly-horizontal lines, and one for wide mostly-horizontal
+lines (255 pixels or wider).  The last requires 16-bit math, and is
+slightly slower.
+
+The Applesoft routine isn't quite the same as the standard Bresenham
+algorithm, because it doesn't move diagonally.  Consider a line from
+(0,0) to (50,10) -- gently sloping down and to the right.  The standard
+algorithm would plot exactly 51 pixels, one in each horizontal position.
+The "pen" always moves one pixel right, but sometimes also moves down.
+
+In Applesoft, the "pen" can move either right or down, but can't do
+both at once.  This results in lines that feel thin when near horizontal
+or vertical, but become thicker as they approach 45 degrees.  This
+reduces performance, because Applesoft draws twice as many pixels for a
+diagonal line as the basic algorithm.  It can also be visually jarring
+when animated, because lines get very thick when near diagonal.
+
+Different applications have used different styles; for example:
+
+- Stellar 7 and Elite for the Apple II use Bresenham-style lines.  If
+  you look at near-diagonal lines on a color monitor you can see the
+  pixels alternating green and purple.
+- A2-FS1 Flight Simulator appears to be using Bresenham lines but with
+  doubled bits, effectively treating the screen as having 140 pixels.  This
+  gives solid white lines with a fairly consistent feel.
+- GraFORTH doubles the bits, but treats the screen as 256 pixels wide
+  (not 280... it gives up 24 pixels to improve performance).  White
+  lines are thick like Flight Simulator, but feel less jagged because
+  each step can move left or right by one bit rather than two.
+
+The SetLineMode function lets you choose between "draw" and "xdraw".  The
+former draws color pixels, setting and clearing bits as needed, while
+the latter inverts whatever is currently on the screen.  This can have
+some unusual effects.  Drawing the same line twice erases the line.
+Drawing a green line over a purple line gives you a white line.  Drawing
+with colors 5 and 6 can produce odd results, because the high bit inverts
+every time you touch a byte -- which means the ends of a horizontal line
+will be a different color if the byte holds an even number of affected
+pixels.  It's best to draw with colors 0-3 when in xdraw mode.  Clearing
+the background to color 4, rather than 0, will cause drawing in colors
+0-3 to actually be 4-7.
+
+#### Performance ####
+
+Mostly-horizontal lines step horizontally each iteration, and sometimes
+step vertically.  Mostly-vertical lines step vertically each iteration,
+and sometimes step horizontally.  Each part of the operation has a cost,
+so the fastest lines are the ones drawn primarily in a single direction.
+Diagonal lines are the worst case for performance.
+
+The current code requires just under 80 cycles per pixel for diagonal
+movement, and about 56 for single-direction movement.  There's another
+150 cycles or so per line for the initial setup.
+
+Vertical lines cost about 43 cycles per pixel.  Horizontal lines are
+handled as a trivial FillRaster call, which at peak performance can write
+7 pixels in 10 cycles.
+
+This is about as fast as you can get with the Bresenham run-length
+algorithm and Applesoft-style color handling.  It's possible to go faster
+by switching to a different pixel style, or using a run-slice approach.
+
+
+### Rectangles ###
+
+Filled rectangles are currently implemented by putting the left and
+right edges into the rasterization table, and calling FillRaster.
+
+Outline rectangles could be drawn as four lines, but that doesn't look
+very good in color unless you get the lines on the right columns.  To
+ensure that the edges are in the correct color, outline rectangles are
+drawn as four separate items: a two-pixel-wide left edge, a two-pixel-wide
+right edge, and horizontal lines at the top and bottom.  FillRaster does
+the actual work.
+
+#### Performance ####
+
+FillRaster is suboptimal for rectangles, because it works by rows rather
+than by columns (see "Vertically-Challenged Rasterization" later in this
+document).  Rectangles could be drawn 2.5x faster with dedicated code,
+but at a cost of hundreds of bytes of memory.
+
+The advantage of using FillRaster is that we need it for filled circles,
+so adding support for rectangles was nearly free.  And it's still pretty
+fast.
+
+
+### Circles ###
+
+Circles are computed with Bresenham's algorithm.  The idea is to compute
+one octant of the circle with this bit of magic:
+
+    void drawOutline(int cx, int cy, int rad) {
+        int x, y, d;
+
+        d = 1 - rad;
+        x = 0;
+        y = rad;
+
+        while (x <= y) {
+            plot(cx, cy, x, y);
+
+            if (d < 0) {
+                d = d + (x * 4) + 3;
+            } else {
+                d = d + ((x - y) * 4) + 5;
+                y--;
+            }
+            x++;
+        }
+    }
+
+Then each X/Y coordinate is plotted eight times:
+
+    (cx+x, cy+y) (cx-x, cy+y) (cx+x, cy-y) (cx-x, cy-y)
+    (cx+y, cy+x) (cx-y, cy+x) (cx+y, cy-x) (cx-y, cy-x)
+
+For an outline circle, we plot every point.  For a filled circle, we add
+each point to a rasterization table.  Near the top and bottom of the
+circle there will be multiple updates to the same line, with each update
+replacing the previous one (which works, as we are moving "outward").
+
+The center point of the circle must be on screen, but it's not necessary
+for the entire circle to fit.  Coordinates outside screen space are clipped.
+
+#### Performance ####
+
+The implementation of Bresenham's algorithm is straightforward, and is
+about as fast as it's going to get.  There are actually two versions of
+the core computation.  If the radius is less than 41, we can keep all of
+the variables in 8 bits.  For circles with radius 41 and larger, we need
+to use 16 bits, slowing each step slightly.
+
+There are also two versions of the octant plot.  If the circle fits entirely
+on-screen, we use a simple version.  If it doesn't, we use a version that
+clips values.  For rasterization that means clamping X to the left or
+right edge, and skipping updates that are off the screen in the Y dimension.
+For an outline circle we simply don't plot any clipped points.
+
+The rendering of filled circles is very fast, though there is a possibility
+of optimizing the center-fill of large circles.  Outline circles were
+added by inserting JSR PLOT at key points, and could perhaps be faster.
+
+
+### Drawing Lines with Indexed Byte-Arrays ###
+
+The &PLOT command allows a BASIC program to execute a series of line-draw
+commands with a single statement.  Think of it like shape-table animation
+with lines instead of plotted points.
+
+Suppose you want to draw a rectangle with an X through the middle.  We'll
+make it 11 units wide and 21 units high.  To draw that in the middle of
+the screen, we'd set CX=139 and CY=95, then draw lines offset from that
+by +/- 5 in X and +/- 10 in Y:
+
+  HPLOT CX-5,CY-10 TO CX-5,CY+10 : REM LEFT  
+  HPLOT CX-5,CY-10 TO CX+5,CY-10 : REM TOP  
+  HPLOT CX+5,CY-10 TO CX+5,CY+10 : REM RIGHT  
+  HPLOT CX-5,CY+10 TO CX+5,CY+10 : REM BOTTOM  
+  HPLOT CX-5,CY-10 TO CX+5,CY+10 : REM BACKSLASH  
+  HPLOT CX+5,CY-10 TO CX-5,CY+10 : REM SLASH  
+
+Six lines, each of which needs four coordinates.  We'd need 24 bytes
+to store that in an integer array.
+
+Suppose instead we identified the four vertices, and numbered them:
+
+  #0 CX-5,CY-10  
+  #1 CX+5,CY-10  
+  #2 CX-5,CY+10  
+  #3 CX+5,CY+10
+
+and then created a list of line segments using the vertex indices:
+
+  HPLOT #0 TO #2  
+  HPLOT #0 to #1  
+  HPLOT #1 TO #3  
+  HPLOT #2 TO #3  
+  HPLOT #0 TO #3  
+  HPLOT #1 TO #2  
+
+This requires (4*2) + (6*2) = 20 bytes, for a small savings.  The real
+value in the approach is that it separates the description of the shape
+from the placement of the points.  For example, if you want to change
+vertex #0 to (CX-7,CY-12), you don't have to make changes to three
+separate HPLOT calls.  (This is particularly useful when you have code
+that scales and rotates the vertices.)
+
+For the current release of fdraw, the only built-in transform is
+translation.  Using "&AT cx,cy", you can place the center point anywhere
+on the screen.  This allows you to animate movement of the shape by
+simply calling &AT to change the position, and &PLOT to draw.
+
+The &PLOT command takes three arguments: the address of a vertex array,
+the address of an index array, and the number of line segments to draw.
+These are referred to as "byte arrays" because they are arbitrary
+locations in memory where you have BLOADed or POKEd your shape data, not
+Applesoft arrays.  The count can be from 0 to 127.  You can optionally
+add an AT to the end; if not present, the coordinates of the previous AT
+are used.  The initial value is the center of the screen (x=139 y=95).
+
+The vertex array uses two signed bytes per vertex (-128 to 127), one for
+the X coordinate and one for the Y coordinate.
+
+The index array uses two bytes per line segment.  Each byte is an index
+into the vertex array, from 0 to 127.
+
+Here's an Applesoft program that implements the above example.  (The DATA
+statements use negative numbers for clarity; if you replace the negative
+values with 256+value, e.g. -5 becomes 251, then you can avoid the IF
+statement and just poke the value directly.)
+
+    100  TEXT : NORMAL : HOME 
+    200  &  NEW : &  HGR : VTAB 21
+    210  &  HCOLOR= 3
+    500  REM ARRAY TEST
+    510 AD = 768: REM $300
+    520  READ D: IF D = 1000 THEN 560
+    530  IF D < 0 THEN D = 256 + D
+    540  POKE AD,D:AD = AD + 1: GOTO 520
+    560  &  PLOT 768,776,6: &  AT 50,50: &  PLOT 768,776,6
+    570  POKE 768,256 - 10: POKE 769,256 - 20: &  PLOT 768,776,6 AT 100,50
+    600  DATA -5,-10, 5,-10, -5,10, 5,10
+    610  DATA 0,2, 0,1, 1,3, 2,3, 0,3, 1,2, 1000  
+
+This draws the shape twice, once at the middle of the screen, once centered
+at 50,50.  It then adjusts the top-left coordinate, and draws the shape
+centered at 100,50.  Looking at the output, you can see that the top-left
+corner of the third instance has moved, and all three lines from that
+point have moved with it.
+
+If a vertex ends up off-screen, lines that use that vertex are omitted
+(not clipped).  If you tried to draw the example shape at (0,0), nothing
+would happen, because every line has at least one point that would be
+off-screen -- only point #3 is still visible, and all of the lines that
+use that point extend off screen.
+
+You can specify a maximum of 128 vertices and 128 index pairs for a
+single call.  If none of the line segments share vertices, you'll need
+two vertices per line, which means a cap of 64 lines.
+
+#### Performance ####
+
+There isn't a whole lot to it -- it just feeds the lines to DrawLine.
+The key speed advantage is the removal of the Applesoft overhead.
+
+
+<div id='ideas'/>
+## Enhancement Ideas ##
+
+Some ideas for future versions of fdraw.
+
+### fdraw ###
+
+Line clipping would make the array-draw function more useful for
+animation projects.  If we accepted signed 16-bit values as input to
+the clip function, we could specify an AT point outside the screen bounds.
+That could be extended to circles, which could have off-screen centers.
+
+A "game line" function or line mode that restricts coordinates to 0-255
+and ignores color might be worth an experiment.
+
+Triangle rasterization is possible, but perhaps a bit silly.
+
+We could handle ellipses, but they're more complicated than circles, and
+are slower to compute -- you need a couple of multiplications during
+setup, and the asymmetry means you have to compute a quadrant rather
+than an octant.  If the goal is fast animation rather than general-purpose
+picture painting then there's little value in supporting ellipses.
+
+Some of the inner loops are almost certainly paying an extra cycle to
+cross a page boundary.  That's not easy to fix without adding absurd
+amounts of padding.
+
+"USE_FAST" could be applied more aggressively to reduce the size.
+
+Having "fast" vs. "small" builds was mostly an experiment to see how
+much of a difference in size and speed we'd get by dropping some of
+the more expensive operations.  Another way to reduce size would be to
+make the build modular, so you could (say) omit circle drawing or only
+include line drawing.  Some trade-offs would have to be made, e.g. if
+you only wanted line drawing then it makes sense to disable (or replace)
+the horizontal-line optimization that calls FillRaster, as that requires
+some sizeable tables that would otherwise be unused.
+
+### Amperfdraw ###
+
+The Amperfdraw API is somewhat minimal and could be improved.  Taking a
+cue from Beagle Graphics, the rect and circle calls should probably look
+more like:
+
+  &DRAW width,height [AT left,top]
+  &COS radius [AT left,top]
+
+The "&AT" coordinate, currently only used by &PLOT, should be more
+widely used.  Not only is it more convenient, it's also slightly faster,
+since we don't have to parse the left/top coordinates each time.
+
+The existing code is (somewhat lazily) using the Applesoft routines to
+parse coordinates, which includes the range check.  We wouldn't be able
+to use them for width/height, because we would need to take values in the
+range (0-280, 0-192), where width/height of zero means "draw nothing".
+
+I deliberately used Applesoft tokens, rather than arbitrary words, to
+make commands simpler to parse.  Some of them don't fit that well.  COS
+and SIN are circle-related, but it's not obvious which is outline and
+which is filled.  DRAW and XDRAW don't really sound like rectangle-draw
+calls, and would be much more appropriate if used to set the line draw
+mode.  Spending a few bytes & cycles to get better names might be
+worthwhile.
+
+It's possible to store &PLOT arrays in actual BASIC integer arrays,
+which might make them easier to code for.  The fact that arrays are
+DIM()ed once, cannot be resized, and cannot be discarded makes them
+difficult to use for dynamic data.
+
+Currently &PLOT takes a list of vertices and a list of line segments.
+We could also support "continuous line" mode, where it just plays
+connect-the-dots (saves space, doesn't really affect speed).  Being
+able to embed color changes could be handy.
+
+&PLOT handles lines and vertices the way Applesoft does, with inclusive
+coordinates.  This results in overdraw when vertices are shared.  This
+is a (small) performance hit, and causes graphical glitches when connected
+lines are drawn in "xdraw" mode.
+
+
+<div id='additional-notes'/>
+# Additional Notes #
+
+Getting into the gory details here.
+
+## Setting a pixel ##
+
+Hi-res pixels are curious creatures.
+
+Pixel color values are determined by adjacent bits.  The various drawing
+routines only set one bit at a time, so "drawing" in green (hcolor=1) will
+cause bits to be set in odd columns, cleared in even columns.  We don't
+touch adjacent bits, so drawing purple (hcolor=2) in column 0 and green
+in column 1 will produce a white line, while drawing them with the columns
+reversed will produce a black line.
+
+Making life more complicated is the use of the high bit in each byte, which
+affects the color.  If you draw a purple line in column 0, and a black1
+line with hcolor=4 in column 6, the purple line turns blue, because the
+black1 line sets the high bit.
+
+To set a bit at an arbitrary X offset, we need to do the following:
+
+(1) Determine which byte to change (xc / 7) and which bit (xc mod 7).
+(2) Determine the color mask for that byte.  For green, it's 0x2a
+    (00101010) in even columns, 0x55 (01010101) in odd columns.
+(3) Set or clear the target bit and the high bit, leaving the others
+    intact.
+
+One way to do this is illustrated below.  Assume we're drawing a green
+line at X=17.  There's already a green dot at X=15, which gives us a
+bit pattern of 00000010.  (Bits are "backwards", i.e. the bit on the
+right is the pixel on the left.)
+
+  LDY byteoffset                      Y=2
+  LDX bitoffset                       X=3
+  LDA bitmask,x                       A=0x88 (10001000)
+  STA <andmask              
+  LDA oddevencolor,y   4 cyc          A=0x2a (00101010)
+  EOR (hbasl),y        5 cyc          A=0x28 (00101010 ^ 00000010 = 00101000)
+  AND <andmask         3 cyc          A=0x08 (00101000 & 10001000 = 00001000)
+  EOR (hbasl),y        5 cyc          A=0x0a (00001000 ^ 00000010 = 00001010)
+  STA (hbasl),y        6 cyc
+
+As a second example, here's how we plot a black1 (hcolor=4) point at X=6
+when there's a purple point (hcolor=2) at X=0 (00000001).
+
+  LDA bitmask,x                       A=0xc0 (11000000)
+  STA <andmask              
+  LDA oddevencolor,y   4 cyc          A=0x80 (10000000)
+  EOR (hbasl),y        5 cyc          A=0x81 (10000000 ^ 00000001 = 10000001)
+  AND <andmask         3 cyc          A=0x80 (10000001 & 11000000 = 10000000)
+  EOR (hbasl),y        5 cyc          A=0x81 (10000000 ^ 00000001 = 10000001)
+  STA (hbasl),y        6 cyc
+
+Note the purple pixel is still set, but now the high bit is as well,
+changing it to blue.
+
+The trick is to start with the color pattern, which specifies how we want
+the bits to be set or cleared.  We EOR in the screen, which causes the
+bits in A to be inverted wherever they were set on the screen.  Next we
+use the AND mask to zero out the bits we don't want to update on-screen.
+When we do the second EOR from the screen, the bits we just zeroed will
+take on the values from the screen, while the bits we didn't zero will
+return to their original values from the color pattern (because EORing
+twice with the same value restores the original).
+
+It might look a little nicer if we always set two adjacent bits.  That
+would avoid the phenomenon where drawing from 0,0 to 0,10 in green doesn't
+appear to do anything.  For 6 out of 7 pixels this is easy, a simple
+adjustment to the bitmask, but for the 7th pixel we'll need to update an
+adjacent byte... unless it's the rightmost byte, which would cause us to
+overflow and wrap around (or write into a screen hole).  GraFORTH
+renders lines this way, avoiding the overflow issue by limiting the X
+coordinate range to (0,255).
+
+To implement "xdraw" mode, where instead of setting pixels we invert
+the current value, we can just omit (or NOP out) the first EOR.
+
+We could draw faster if we simply set the new bits, rather than setting
+some and clearing others according to the color mask.  This could result
+in some odd behavior, e.g. drawing a horizontal green line over a
+horizontal purple line would result in a white line.  Given how strange
+things are in general this might not be an issue.
+
+For 3D games like Stellar 7 or Elite, which essentially draw thin
+monochromatic lines, we can drop the color mask and just set the bit on
+the screen.  Plotting a pixel is then simply:
+
+  LDA (hbasl),y        5 cyc
+  ORA <bitmask         3 cyc
+  STA (hbasl),y        6 cyc
+
+This cuts the cycle count from 23 to 14.  It's also not necessary to
+worry about the high bit, which can save a few cycles when shifting
+the bitmask.  Most games are also able to limit the "active" part of
+the screen to fewer than 255 pixels, which eliminates some 16-bit math
+during setup.
+
+For "xdraw" mode, the "ORA <bitmask" becomes "EOR <bitmask".
+
+
+## Single- or Double-Buffered Animation ##
+
+Because the Apple II has two hi-res graphics pages, it's possible to
+double-buffer the animation to reduce or eliminate flicker.  The
+application displays one page while erasing and redrawing the other.
+
+In most cases it's faster to erase the entire screen with the Clear
+function than it is to draw over with black.  For example, consider four
+diagonal lines in a diamond shape, 100 pixels on a side.  Diagonal
+lines are the most expensive, as each step requires advancing in
+both vertical and horizontal directions.  The current implementation
+needs about 80 cycles per diagonal pixel, or 100 * 4 * 80 = 32,000 cycles
+to draw four medium-length lines (ignoring the setup cost for each line).
+If you assume that the average cost to draw a pixel is about 70 cycles,
+you can draw 570 pixels in the time it takes to erase the full screen.
+
+We can clear the entire screen in about 40,000 cycles.  If the drawing
+area is smaller, a custom clear routine could do it in even less.
+(Imagine your drawing routines keep track of the highest and lowest
+line that anything touches, and then just erase the "dirty" lines.) So
+unless you're doing relatively light rendering, you'll get the best
+performance by wiping all or part of the screen rather than drawing over the
+previous contents.
+
+The &INVERSE command is intended to make double-buffered animation
+easier from BASIC.  Use &HGR2 to switch to full-screen mode, then call
+`&SCRN(1):&HCOLOR=0:&CLEAR` to select page 1 and clear it.  Draw your
+first frame, then call &INVERSE to display page 1 and select page 2
+for drawing.
+
+
+An alternative approach is exemplified by Elite.  The game only uses
+one hi-res page, but doesn't noticeably flicker (though distant objects
+sort of "sparkle").  Suppose you're writing a similarly line-oriented
+game, and your rendering cycle looks like this:
+
+ - Step 1: draw over previous content with black
+ - Step 2: draw new content with white
+
+Your game will flicker badly without double-buffering, because there will
+be a few display refresh periods where most of the lines have been erased.
+Suppose instead you did this:
+
+ - For each line in the shape, erase the old line, then draw the line in
+   its new position
+
+Now you might get some flickering on certain lines if the beam crosses
+them while they're black, but the shape as a whole will be visible most
+of the time.  The trouble with this approach is that, if your shape is
+moving across the screen, you'll be drawing black over some recent white
+lines, causing some distracting artifacts.
+
+The way to make this work is to use "xdraw" mode, where bits are toggled
+rather than set or cleared.  If you draw a new line across an old line that
+will soon be erased, the crossing point is cleared.  When the old line
+is erased, the crossing point is set white again, so your new line
+appears unbroken.
+
+It should be noted that this works well for Elite because they use backface
+elimination, so lines within a single shape don't cross.  It's also
+important to avoid re-drawing points at shared vertices, or your corners
+will disappear unless there are an odd number of lines.
+
+If there's very little on screen, this could be faster than a full clear.
+Mostly it's of value if you need the 8KB occupied by the second hi-res
+page for something other than output.
+
+
+## Vertically-Challenged Rasterization ##
+
+As noted earlier, we can clear the screen in about 40,000 cycles with
+the Clear function, but drawing a screen-sized filled rectangle takes
+about 96,000.  Why the difference?
+
+The FillRaster function handles one horizontal line at a time.  For
+each line it sets any pixels sticking out on the left and right edges,
+and then it jumps into an unrolled byte-stomp function that blasts
+its way through the middle at 10 cycles per byte.  Compare this to the
+Clear function, which only needs 5 cycles per byte.
+
+The trick to improving the speed at which we draw filled rectangles
+is to make it more like the Clear function, which operates on columns
+rather than rows.
+
+Suppose, for example, we figured out which bits we need to set on the
+left edge, and then applied them to every row.  Then we did the same
+for the right edge.  The set-up cost for each edge went from
+(N cycles * Y rows) to (N cycles).  Can we apply this to the middle
+byte as well?
+
+It turns out we can.  The fundamental problem with setting bytes
+horizontally is that we have to index off of a direct page register,
+e.g. "STA (hbasl),y".  The only ways around this either add too much
+loop overhead, too much setup overhead, or require too much memory.
+For any given line, we need to find the base address, and issue a
+6-cycle indirect store, followed immediately by an increment of the Y
+register.  If we're drawing in color it's worse than that, because we
+also have to exclusive-OR the color because the bit pattern flips for
+odd/even columns.
+
+We're much better off unrolling vertically.  Suppose you have 192
+"STA abs,y" instructions, one for each row, one after the other.  You
+no longer need the base address lookup, because it's baked into the
+code, and since we're only touching one column we don't need to worry
+about odd/even color values here.  To use this to draw rows 50-100, you
+would replace the STA in row 101 with an RTS, and then JSR to the 50th
+STA instruction.  After the column is painted, you increment Y, exclusive-OR
+the color value, and jump through again.  (You can make this a little
+faster by JMPing in and out instead, but you pay a bit more for setup
+and cleanup, especially when you have to restore the base address that
+got overwritten by the JMP.)
+
+With this change we're working at 5 cycles per byte, plus the loop
+overhead.  A full-screen FillRect will be about as fast as a Clear.
+
+There are a couple of down sides.  First, you need 192*3=576 bytes to
+hold this pile of store instructions.  If you're drawing a lot of filled
+rectangles, though, the 2x speed improvement would make the size penalty
+worthwhile.  The other problem arises if you use double-buffered animation,
+as the table is hard-wired to page 1.  You can either spend a couple
+thousand cycles when the page flips to rewrite the addresses, or you can
+have a second full copy of the stores for page 2.
+
+The current horizontally-focused implementation uses 256 bytes for its
+unrolled code area, but you wouldn't be able to get rid of that by
+switching to the vertical approach.  The reason the code works the way
+it does is that it's designed to render circles, and those are hard to do
+vertically.  With horizontal rasters, when you look at the left and right
+edges you only need to examine the current row, and set pixels in a
+single byte.  With vertical strips, each byte spans seven columns of
+pixels, so the top and bottom "edges" might be several bytes deep.  The
+code would have to iterate in "edge space" until it reached the meaty
+center, and the cost of doing so would likely erase the benefit of vertical
+fills until your circles got reasonably large.
+
+It's possible that a hybrid approach, in which selected rectangles in the
+center of a large circle are drawn with a fast vertical fill, could be
+used, with slower code rendering the outer edges.  The trick would be to
+come up with an approach that doesn't leave gaps, minimizes overdraw, and
+is sufficiently faster to make the effort worthwhile.
+
--- a/docs/personal-notes.md
+++ b/docs/personal-notes.md
@ -0,0 +1,197 @@
+My Quest for Lines
+==================
+
+As far back as I can remember, I always wanted to draw lines on the
+hi-res screen.
+
+This probably started when I saw Battlezone in the arcades in the early
+1980s.  I still think the game is beautiful -- a first-person shooter
+reduced to the essential elements.  I wanted to write something similar
+for the Apple II, but I didn't know where to start.  (I should probably
+mention that I was 11 years old in 1980.)
+
+Battlezone had a dedicated matrix processor (the "math box"), and a
+vector display that handled the line drawing.  The Apple II had neither
+of those things, which meant that achieving the same level of performance
+and graphical detail wasn't possible.  Despite those shortcomings, Damon
+Slye created a pretty solid Battlezone-ish game in 1983, called Stellar 7.
+A couple of years later, Braben and Bell made another compelling wireframe
+combat game, the space combat sim Elite.  (The A2-FS1 flight simulator
+came out much earlier, but the graphics were blinky, enemies were just
+dots, and the action was much slower-paced.  Of course, it loaded from
+cassette tape and ran in 16KB, so they didn't have much choice.)
+
+Seeing these games showed me that the problems could be solved.  I decided
+that the place to start was line drawing, because (a) line drawing is
+pretty fundamental to wireframe 3D, and (b) I wasn't getting the performance
+I needed out of HPLOT TO.
+
+Somewhere in the mid-1980s -- I was in high school now -- I began by trying
+to figure out how line drawing worked.  Suppose, for example, you want to
+HPLOT 0,0 TO 19,5.  How do you decide which pixels to set?
+
+I wrote a program (which I recently found) called "HPLOT SIMULATOR".  It
+computed the ratio of vertical to horizontal pixels (e.g. 6 / 20 = 0.3),
+and marched horizontally across the screen, adding the fractional value to
+the Y coordinate at each step.  The result was a pretty good-looking line.
+
+The trouble was that it used floating-point math and required division,
+things that the 6502 is not very good at.  It occurred to me that division
+can be performed as a series of integer subtractions.  (It probably occurred
+to me because I didn't know any other way to divide on the 6502, not having
+encountered the shift-and-subtract approach yet.)  So if you initialize a
+counter to zero, and add 6 to it each time you move horizontally, then when
+it reaches 20 you know it's time to move vertically.  Subtracting 20 from
+the counter resets it, but retains the division remainder as the starting
+point, so you retain the fractional part.
+
+When I went to college I took a graphics class, and was introduced to
+Bresenham's classic line algorithm.  This was essentially the same as what
+I'd figured out for myself, but with two refinements: (1) it used signed
+values, allowing a slightly cheaper "< 0" comparison, and (2) it started
+with the counter half full, correcting the slight lopsidedness of my lines.
+
+The graphics class inspired me to write a 3D game library called Arc3D
+in 1990.  I used it to create a pair of demos: "Not Modulae", which
+animated several 3D shapes on the screen, including a pair of ships from
+Elite; and "Not Stellar 7", a graphics demo that let you drive around
+(and, sadly, through) some tanks from Stellar 7.  The Arc3D library was
+written for the IIgs, in 65816 assembly, and used the super-hi-res screen.
+Having a better CPU, lots more memory, and a less-quirky graphics
+architecture made things easier than doing the same on a classic Apple II.
+
+I wrote my own super-hi-res line drawing code, of course, but a year later
+when I disassembled somebody else's demo I found better code.  Which, it
+turned out, they had also lifted from another source, an FTA demo.  I
+dropped mine and used theirs.
+
+After I graduated from college, my side projects tended more toward data
+compression and Netrek, so Arc3D was never improved upon.
+
+Fifteen years later, in 2006, there was a discussion on a Usenet group
+about circle rendering.  Once upon a time I'd drawn circles from BASIC
+with trig functions, but it was painfully slow, which made me wonder
+about a part of the game Horizon V where you steer through a series of
+circles.  I wanted to try it for myself and see what it would take.
+(Looking at a youtube video of Horizon V, the animation is more radial
+than circular... I suspect it's not really drawing circles at all.)
+
+I first announced my results in a
+[comp.sys.apple2.programmer](https://groups.google.com/forum/#!msg/comp.sys.apple2.programmer/Vj_xVjMHaR0/cLU3t2TlPrMJ)
+posting.  I had focused on filled circles, rather than outline circles,
+since that seemed like a more interesting challenge.  The "fdraw" demo
+supported fast rendering of filled circles, filled rectangles, and had
+a very fast screen clear.  A week later, after a bit of cleanup, I
+[released the fdraw v0.2 sources](https://groups.google.com/d/msg/comp.sys.apple2.programmer/Un4pV5p8Elw/6qZVAPc_da0J).
+
+It occurred to me at the time that this would be a handy place to stick
+the hi-res line drawing code I'd always wanted to write.  Somewhere around
+this time I also sort of poked at the idea of writing a dedicated hi-res
+graphics compression program.
+
+Fast forward another nine years, to 2015.  After learning about the LZ4
+format, I went back to my data compression roots and wrote
+[fhpack](https://github.com/fadden/fhpack) and some demos.  I had so much
+fun doing it that I decided it was finally time to write some hi-res
+line drawing code.
+
+Being older, wiser, and having easy access to relevant information, I
+began with the appropriate chapters in Michael Abrash's _Graphics
+Programming Black Book Special Edition_.  This covered the standard
+algorithm, but also had a chapter on a faster "run-slice" approach.
+This intrigued me, because instead of the usual "step right, check if
+it's time to move down, step right, check if it's time ..." logic, it
+says, "figure out how long each line segment is; then, move right 3
+times, step down, move right 4 times, step down, ...", saving a lot of
+redundant computation.  The trouble is that it requires fixed-point
+division, and drawing N adjacent pixels is tricky when your graphics
+architecture has 7 horizontal pixels per byte.  You'd have to be a bit
+crazy to try to get that to work.
+
+So I went with a standard approach, and used the Applesoft ROM method of
+coloring pixels (discussed in the fdraw docs).  I carefully optimized
+the code, and squeezed out as much performance as I could.
+
+When I was done, I began looking around at what other people did to see if
+there were any tricks I missed.
+
+I looked at the Applesoft ROM code.  Very clever, but very much optimized
+for space over speed.  Also, because it's in ROM, self-modifying code is
+not possible, so they lose a cycle here and there.
+
+Next I looked at GraFORTH.  I figured out how functions were arranged,
+identified the plot function, and disassembled it with CiderPress.  It uses
+a pretty standard algorithm, but supports multiple drawing modes and sets
+two adjacent bits for better-looking colored lines.  Good use of
+self-modifying code, but some choices were made to reduce code size at the
+expense of speed.  My code was faster.
+
+Next I looked at Elite.  Digging through memory after the program had
+loaded, I found a collection of purpose-built line functions.  Some drew
+color, most used EOR to "xdraw" monochrome lines.  Standard Bresenham
+approach, with a bit of variation on the Y-lookup table -- their table is
+only 24 bytes (1/8th of the screen), and they use a quick "add 4 to the
+high byte" 7 out of every eight lines.  I tried applying this to my code,
+but it turned out that just using a full lookup table was a tiny bit faster.
+
+Next I looked at Stellar 7, one of my earliest inspirations.  I scanned
+through some files with CiderPress, looking for anything line-draw-esque.
+(If you spend enough time drawing lines you start to see patterns.)
+After about five minutes I found the code, in the same file as this
+gigantic unrolled division routine.  But as I started to dig into the code
+I noticed that it was using a count oddly, and this one function was...
+HOLY CATS he did run-slicing.
+
+And he did it big.  There are several line functions, all of them padded
+out to live on a single page (so that none of the branches cross page
+boundaries, which costs an extra cycle).  It has the usual special cases --
+simple horizontal and vertical lines -- and the usual split between
+vertically-dominant and horizontally-dominant lines.  But there are *three*
+different functions for drawing mostly-horizontal lines, selected based on
+slope, all of which try to set multiple horizontal pixels at once.  The
+slope of the line affects how the code is structured; for example, for
+very shallow lines it expects that it will often be able to set an entire
+byte at once.  Color is not supported, so pixels are set with a simple
+OR operation.
+
+It's very impressive, and a wee bit terrifying.  But when you're making
+a game that will be spending much of its time drawing lines, you really
+want to optimize those draw functions.
+
+The tricky part is that divide.  The division routine is unrolled to a
+healthy 187 bytes long, and might take 240 cycles to run.  For short
+lines and mostly-vertical lines it might have been more efficient to skip
+the division and just use a run-length implementation, but the ability to
+set multiple bits at once for mostly-horizontal lines is a huge win.  It's
+a fair bet that the code in Stellar 7 is the fastest line drawing
+implementation for the Apple II.  (Of course, I haven't looked at Arcticfox,
+the sequel...)
+
+The general structure of the code was actually very similar to mine: always
+draw left to right, use self-modifying code to handle up vs. down, and so on.
+I didn't come away with any new ideas for optimizations to my run-length
+implementation from this or the other programs I looked at... but there
+are a lot of other games that I haven't disassembled.
+
+
+So, 30+ years after HPLOT SIMULATOR, here I am with a bunch of code for
+drawing lines on the Apple II hi-res screen.
+
+I don't plan on writing Battlezone for the Apple II.  Stellar 7 did that,
+and more.  My goal in developing fdraw was to scratch a very old itch.
+
+I had forgotten how much fun this stuff is.  Working in ARM assembly
+language on Android offered similar challenges, but you're never entirely
+sure exactly how your code will perform on the wide range of CPU
+architectures (affecting instruction interleave, cache size and
+replacement policy, etc.), you have to guess at cache misses and the
+success rate of data prefetching, and it's difficult to measure results when
+there are multiple threads running and interrupts firing.  On the Apple II
+you can count every cycle, and know exactly what will happen when.
+
+I don't expect that anyone will find the code useful, but that wasn't
+really the point.
+
+Andy McFadden  
+August 2015
+
--- a/fdraw-disks.zip
+++ b/fdraw-disks.zip