diff --git a/FDRAW.CIRCLE.S b/FDRAW.CIRCLE.S
new file mode 100644
index 0000000..bfe84db
--- /dev/null
+++ b/FDRAW.CIRCLE.S
@@ -0,0 +1,752 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Circle rendering             *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* TODO: if USE_FAST is 0, replace the outline circle
+* plot code with calls to DrawPoint (or maybe a
+* common sub-function so we don't trash the input
+* parameters).  Saves a little space.
+
+
+********************************
+*
+* Draw a circle.  The radius is in in_rad, and
+* the center is at in_x0l+in_x0h,in_y0.
+*
+* The point-plot call sites inside calc_circle (labels
+* _cp00.._cp15) are self-modified: fixcplot overwrites
+* the opcode byte of each with the value in A.  We peek
+* at the opcode currently stored at _cp08 and only
+* re-patch when it is not already a JSR.
+*
+********************************
+DrawCircle
+        lda   #$20            ;JSR opcode = "plot outline points"
+        cmp   _cp08           ;configured for outline?
+        beq   :okay           ;yes, nothing to patch
+        jsr   fixcplot        ;no, rewrite call sites with opcode in A
+:okay
+        jmp   calc_circle     ;tail call; calc_circle's RTS returns to caller
+
+
+********************************
+*
+* Draw filled circle.
+*
+* Same as DrawCircle, except the plot call sites are
+* patched to BIT abs ($2c) so no outline points are
+* plotted, and the rasterized rows are then filled by
+* FillRaster.
+*
+********************************
+FillCircle
+        lda   #$2c            ;BIT abs opcode = "don't plot"
+        cmp   _cp08           ;configured for fill?
+        beq   :okay           ;yes, nothing to patch
+        jsr   fixcplot        ;no, rewrite call sites with opcode in A
+:okay
+        jsr   calc_circle     ;populate raster tables and rast_top/bottom
+        jmp   FillRaster      ;tail call to render them
+
+
+* Calculate a circle, using Bresenham's algorithm.  The
+* results are placed into the rasterization buffers.
+*
+* in_rad must be from 0 to 255.  The x/y center
+* coordinates must be on the screen, but the circle
+* can extend off the edge.
+*
+* The computed values are stored in the rasterization
+* tables.  For an outline circle, we also plot the
+* points immediately.
+
+        do    USE_FAST        ;*****
+* local storage -- not used often enough to merit DP
+circ_8bit ds  1               ;nonzero: radius small enough for 8-bit 'd'
+circ_clip ds  1               ;number of screen edges the circle may cross
+        fin                   ;*****
+
+calc_circle
+max_fast_rad equ 41
+]cxl    equ   zloc0
+]cxh    equ   zloc1
+]cy     equ   zloc2
+]dlo    equ   zloc3
+]dhi    equ   zloc4
+]xsav   equ   zloc5
+]ysav   equ   zloc6
+]min_x  equ   zloc7           ;min/max offsets from center
+]max_x  equ   zloc8           ;(min is above center, max
+]min_y  equ   zloc9           ; is below)
+]max_y  equ   zloc10
+]hitmp  equ   zloc11
+* only used by hplot for outline circles
+]hbasl  equ   zptr0
+]andmask equ  zloc11          ;overlaps with ]hitmp
+]savxreg equ  zloc12
+]savyreg equ  zloc13
+
+* Special-case radius=0.  It removes an annoying
+* edge case (first y-- becomes 0xff, but 6502 cmp
+* is unsigned).
+* NOTE(review): this path only fills in the raster
+* table; none of the _cpNN plot sites is reached, so
+* an outline circle of radius 0 plots nothing.
+* Confirm that is intended.
+        lda   in_rad
+        bne   :notzero
+        ldy   in_y0
+        sty   rast_top
+        sty   rast_bottom
+        lda   in_x0l
+        sta   rastx0l,y
+        sta   rastx1l,y
+        lda   in_x0h
+        sta   rastx0h,y
+        sta   rastx1h,y
+        rts
+
+* Use different version of function for small
+* circles, because we can do it all in 8 bits.
+:notzero
+        do    USE_FAST        ;*****
+        ldy   #$01
+        cmp   #max_fast_rad   ;in_rad in Acc
+        blt   :use_fast
+        dey
+:use_fast sty circ_8bit
+        fin                   ;*****
+
+        lda   in_x0l          ;copy center to DP for speed
+        sta   ]cxl
+        lda   in_x0h
+        sta   ]cxh
+        lda   in_y0
+        sta   ]cy
+
+* Compute min/max values, based on offset from center.
+* These are compared against offset-from-center x/y.
+* We need tight bounds on Y because we use it to
+* compute the rast_render top/bottom.  Getting tight
+* bounds on X is not so important, but we still need
+* it for the no-clip optimization.
+*
+* X starts at 4 and is decremented once for every
+* screen edge the circle cannot reach (limit clamped
+* to the radius).  If it ends up 0, no clipping is
+* needed at all.
+        ldx   #$04            ;count edges needing clip
+
+        lda   #NUM_ROWS-1     ;191
+        sec
+        sbc   ]cy             ;maxY = 191-cy
+        cmp   in_rad
+        blt   :ylimok
+        lda   in_rad          ;clamp to radius
+        dex
+:ylimok sta   ]max_y          ;maxY = min(191-cy, rad)
+
+        lda   ]cy             ;minY = cy
+        cmp   in_rad
+        blt   :ylimok2
+        lda   in_rad          ;clamp to radius
+        dex
+:ylimok2 sta  ]min_y
+
+* NOTE(review): the X-limit code from here down to
+* :xlimdone was damaged in the copy of the source this
+* was taken from (everything between a '<' and the
+* next '>' was lost).  It has been rebuilt from the
+* surviving comments, the Y-limit code above, and the
+* way ]min_x/]max_x are used by with_clip.  Compare
+* against a pristine FDRAW.CIRCLE.S before relying on it.
+        lda   ]cxh
+        beq   :xlimlo
+
+* cx is 256..279.  The left edge is more than 255
+* pixels away, so it can never be reached; the right
+* edge is 279-cx = 23-cxl pixels away (0..23).
+        lda   #<NUM_COLS-1    ;low byte of 279 (23)
+        sec
+        sbc   ]cxl            ;maxX = 279-cx
+        cmp   in_rad
+        blt   :xlimok
+        lda   in_rad          ;clamp to radius
+        dex
+:xlimok sta   ]max_x
+        lda   in_rad          ;minX = rad (cx > 255 >= rad)
+        sta   ]min_x
+        dex
+        jmp   :xlimdone
+
+* cx is 0..255.  Examples (note #<NUM_COLS-1 is 23):
+*  cx=0, 23-0=23 + carry set --> bad, must use rad
+*  cx=24, 23-24=255 + carry clear --> ok, chk rad
+*  cx=255, 23-255=24 + carry clear --> ok, chk rad
+:xlimlo
+        lda   #<NUM_COLS-1    ;low byte of 279 (23)
+        sec
+        sbc   ]cxl            ;low byte of 279-cx
+        bcs   :xuserad        ;no borrow: 279-cx >= 256 > rad
+        cmp   in_rad
+        blt   :xlimok1
+:xuserad lda  in_rad          ;clamp to radius
+        dex
+:xlimok1 sta  ]max_x
+
+        lda   ]cxl            ;minX = cx (cx <= 255 here)
+        cmp   in_rad
+        blt   :xlimok2
+        lda   in_rad          ;clamp to radius
+        dex
+:xlimok2 sta  ]min_x
+
+:xlimdone
+
+        do    USE_FAST        ;*****
+        stx   circ_clip
+        fin                   ;*****
+
+* set top/bottom rows for rasterizer
+        lda   ]cy
+        clc
+        adc   ]max_y
+        sta   rast_bottom
+        lda   ]cy
+        sec
+        sbc   ]min_y
+        sta   rast_top
+
+        DO    0               ;debug debug debug
+        LDA   ]min_x          ;save a copy where the
+        STA   $0380           ; monitor won't trash it
+        LDA   ]max_x
+        STA   $0381
+        LDA   ]min_y
+        STA   $0382
+        LDA   ]max_y
+        STA   $0383
+        FIN
+
+* Set initial conditions for Bresenham.
+        ldx   #0              ;:x = 0
+        stx   ]xsav
+        ldy   in_rad          ;:y = rad
+        sty   ]ysav
+        lda   #1              ;:d = 1 - rad
+        sec
+        sbc   ]ysav           ;in_rad
+        sta   ]dlo
+        bcs   :hizero         ;C==1 if in_rad<=1
+        ldx   #$ff            ;C was 0, make neg
+:hizero stx   ]dhi
+
+*
+* Outer loop -- plot 8 points, then update values.
+*
+circ_loop
+
+        do    USE_FAST        ;*****
+        lda   circ_clip
+        beq   ncypy
+        jmp   with_clip
+
+* Quick version, no clipping required
+* row cy+y: cx-x and cx+x
+ncypy
+        lda   ]ysav
+        clc
+        adc   ]cy
+        tay                   ;y-coord in Y-reg
+
+        lda   ]cxl
+        sec
+        sbc   ]xsav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp00   jsr   cplotl
+
+        lda   ]cxl
+        clc
+        adc   ]xsav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp01   jsr   cplotrn
+
+* row cy-y: cx-x and cx+x
+ncymy
+        lda   ]cy
+        sec
+        sbc   ]ysav
+        tay                   ;y-coord in Y-reg
+
+        lda   ]cxl
+        sec
+        sbc   ]xsav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp02   jsr   cplotl
+
+        lda   ]cxl
+        clc
+        adc   ]xsav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp03   jsr   cplotrn
+
+* row cy+x: cx-y and cx+y
+ncypx
+        lda   ]xsav           ;off bottom?
+        clc
+        adc   ]cy
+        tay                   ;y-coord in Y-reg
+
+        lda   ]cxl
+        sec
+        sbc   ]ysav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp04   jsr   cplotl
+
+        lda   ]cxl
+        clc
+        adc   ]ysav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp05   jsr   cplotrn
+
+* row cy-x: cx-y and cx+y
+ncymx
+        lda   ]cy
+        sec
+        sbc   ]xsav
+        tay                   ;y-coord in Y-reg
+
+        lda   ]cxl
+        sec
+        sbc   ]ysav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp06   jsr   cplotl
+
+        lda   ]cxl
+        clc
+        adc   ]ysav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp07   jsr   cplotrn
+
+* CLICK
+        jmp   circ_plot_done
+
+        fin                   ;***** (USE_FAST)
+
+*
+* Same thing, but this time clipping edges.
+*
+* For each of the four rows: skip the row if it is off
+* the top/bottom; otherwise store the left and right
+* X, substituting 0 or NUM_COLS-1 (and skipping the
+* outline plot) when that side is off the screen.
+* The right side uses cplotr rather than cplotrn
+* because the left plot (and its Y lookup) may have
+* been skipped.
+*
+with_clip
+
+* row cy+y: cx-x and cx+x
+ccypy
+        lda   ]ysav           ;off bottom?
+        cmp   ]max_y
+        beq   :cypy_ok
+        bge   cypy_skip       ;completely off screen
+:cypy_ok clc
+        adc   ]cy
+        tay                   ;y-coord in Y-reg
+
+        ldx   ]xsav           ;handle cx-x
+        cpx   ]min_x
+        blt   :cxmx_ok
+        beq   :cxmx_ok
+        lda   #0              ;clip at 0
+        sta   rastx0l,y
+        sta   rastx0h,y
+        beq   cxmx_done0      ;always
+        BREAK
+:cxmx_ok lda  ]cxl
+        sec
+        sbc   ]xsav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp08   jsr   cplotl
+cxmx_done0
+
+        cpx   ]max_x          ;handle cx+x
+        blt   :cxpx_ok
+        beq   :cxpx_ok
+        lda   #<NUM_COLS-1    ;clip at 279: low byte...
+        sta   rastx1l,y
+        lda   #>NUM_COLS-1    ;...and high byte (nonzero)
+        sta   rastx1h,y
+        bne   cxpx_done0      ;always
+        BREAK
+:cxpx_ok lda  ]cxl
+        clc
+        adc   ]xsav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp09   jsr   cplotr
+cxpx_done0
+cypy_skip
+
+* row cy-y: cx-x and cx+x
+ccymy
+        lda   ]ysav           ;off top?
+        cmp   ]min_y
+        beq   :cymy_ok
+        bge   cymy_skip
+:cymy_ok lda  ]cy
+        sec
+        sbc   ]ysav
+        tay                   ;y-coord in Y-reg
+
+        ldx   ]xsav           ;handle cx-x
+        cpx   ]min_x
+        blt   :cxmx_ok
+        beq   :cxmx_ok
+        lda   #0              ;clip at 0
+        sta   rastx0l,y
+        sta   rastx0h,y
+        beq   cxmx_done1      ;always
+        BREAK
+:cxmx_ok lda  ]cxl
+        sec
+        sbc   ]xsav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp10   jsr   cplotl
+cxmx_done1
+
+        cpx   ]max_x          ;handle cx+x
+        blt   :cxpx_ok
+        beq   :cxpx_ok
+        lda   #<NUM_COLS-1    ;clip at 279: low byte...
+        sta   rastx1l,y
+        lda   #>NUM_COLS-1    ;...and high byte (nonzero)
+        sta   rastx1h,y
+        bne   cxpx_done1      ;always
+        BREAK
+:cxpx_ok lda  ]cxl
+        clc
+        adc   ]xsav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp11   jsr   cplotr
+cxpx_done1
+cymy_skip
+
+* row cy+x: cx-y and cx+y
+ccypx
+        lda   ]xsav           ;off bottom?
+        cmp   ]max_y
+        beq   :cypx_ok
+        bge   cypx_skip
+:cypx_ok clc
+        adc   ]cy
+        tay                   ;y-coord in Y-reg
+
+        ldx   ]ysav           ;handle cx-y
+        cpx   ]min_x
+        blt   :cxmy_ok
+        beq   :cxmy_ok
+        lda   #0              ;clip at 0
+        sta   rastx0l,y
+        sta   rastx0h,y
+        beq   cxmy_done2      ;always
+        BREAK
+:cxmy_ok lda  ]cxl
+        sec
+        sbc   ]ysav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp12   jsr   cplotl
+cxmy_done2
+
+        cpx   ]max_x          ;handle cx+y
+        blt   :cxpy_ok
+        beq   :cxpy_ok
+        lda   #<NUM_COLS-1    ;clip at 279: low byte...
+        sta   rastx1l,y
+        lda   #>NUM_COLS-1    ;...and high byte (nonzero)
+        sta   rastx1h,y
+        bne   cxpy_done2      ;always
+        BREAK
+:cxpy_ok lda  ]cxl
+        clc
+        adc   ]ysav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp13   jsr   cplotr
+cxpy_done2
+cypx_skip
+
+* row cy-x: cx-y and cx+y
+ccymx
+        lda   ]xsav           ;off top?
+        cmp   ]min_y
+        beq   :cymx_ok
+        bge   cymx_skip
+:cymx_ok lda  ]cy
+        sec
+        sbc   ]xsav
+        tay                   ;y-coord in Y-reg
+
+        ldx   ]ysav           ;handle cx-y
+        cpx   ]min_x
+        blt   :cxmy_ok
+        beq   :cxmy_ok
+        lda   #0              ;clip at 0
+        sta   rastx0l,y
+        sta   rastx0h,y
+        beq   cxmy_done3      ;always
+        BREAK
+:cxmy_ok lda  ]cxl
+        sec
+        sbc   ]ysav
+        sta   rastx0l,y
+        lda   ]cxh
+        sbc   #$00
+        sta   rastx0h,y
+_cp14   jsr   cplotl
+cxmy_done3
+
+        cpx   ]max_x          ;handle cx+y
+        blt   :cxpy_ok
+        beq   :cxpy_ok
+        lda   #<NUM_COLS-1    ;clip at 279: low byte...
+        sta   rastx1l,y
+        lda   #>NUM_COLS-1    ;...and high byte (nonzero)
+        sta   rastx1h,y
+        bne   cxpy_done3      ;always
+        BREAK
+:cxpy_ok lda  ]cxl
+        clc
+        adc   ]ysav
+        sta   rastx1l,y
+        lda   ]cxh
+        adc   #$00
+        sta   rastx1h,y
+_cp15   jsr   cplotr
+cxpy_done3
+cymx_skip
+
+circ_plot_done
+* Update X/Y/D.  Up to about radius=41 we can maintain
+* 'd' in an 8-bit register.
+        do    USE_FAST        ;*****
+        lda   circ_8bit
+        beq   circ_slow
+
+*
+* Bresenham update, with 8-bit 'd'.
+*
+        ldx   ]xsav
+        lda   ]dlo
+        bmi   :dneg
+        txa                   ;:d = d + ((x-y)*4) +5
+        sec
+        sbc   ]ysav           ;x <= y, may be neg or 0
+        asl
+        asl
+        clc                   ;can't know carry
+        adc   #5
+        clc                   ;still don't want carry
+        adc   ]dlo
+        sta   ]dlo
+        dec   ]ysav           ;:y--
+        jmp   :loopbot
+:dneg   txa                   ;:d = d + (x*4) +3
+        asl
+        asl                   ;x always pos, C=0
+        DO    0
+        BCC   :TEST           ;debug
+        BREAK                 ;debug
+:TEST                         ;debug
+        FIN
+        adc   #3
+        adc   ]dlo
+        sta   ]dlo
+:loopbot
+        inx                   ;:x++
+        stx   ]xsav
+        cpx   ]ysav
+        beq   :again
+        bge   circ_done       ;x > y: all octants covered
+:again  jmp   circ_loop
+
+        fin                   ;*****
+
+*
+* Bresenham update, with 16-bit 'd'
+*
+circ_slow
+        CLICK
+        ldx   ]xsav
+        lda   ]dhi
+        bmi   :dneg
+        lda   ]dlo
+        clc
+        adc   #5
+        sta   ]dlo
+        bcc   :noinc
+        inc   ]dhi
+:noinc
+        txa                   ;:d = d + ((x-y)*4) +5
+        ldy   #$00
+        sty   ]hitmp
+        sec
+        sbc   ]ysav           ;x <= y, may be neg or 0
+        beq   :xeqy           ;if x==y, nothing to add
+        ldy   #$ff            ;negative: sign-extend into ]hitmp
+        sty   ]hitmp
+        asl
+        rol   ]hitmp
+        asl
+        rol   ]hitmp
+        clc
+        adc   ]dlo
+        sta   ]dlo
+        lda   ]dhi
+        adc   ]hitmp
+        sta   ]dhi
+:xeqy
+        dec   ]ysav           ;:y--
+        jmp   :loopbot
+
+:dneg   lda   ]dlo            ;:d = d + (x*4) + 3
+        clc
+        adc   #3
+        sta   ]dlo
+        bcc   :noinc2
+        inc   ]dhi
+:noinc2 txa
+        ldy   #0              ;x always positive
+        sty   ]hitmp
+        asl
+        rol   ]hitmp
+        asl
+        rol   ]hitmp
+        clc                   ;not needed?
+        adc   ]dlo
+        sta   ]dlo
+        lda   ]dhi
+        adc   ]hitmp
+        sta   ]dhi
+:loopbot
+        inx                   ;:x++
+        stx   ]xsav
+        cpx   ]ysav
+        beq   :again
+        bge   circ_done       ;x > y: all octants covered
+:again  jmp   circ_loop
+
+
+circ_done rts
+
+
+* Plot a point for outline circle rendering.
+*
+* X and Y must be preserved.  Y holds the current line
+* number.
+*
+* Most DP locations are in use -- see the variable
+* declarations at the start of the circle function.
+
+* cplotl is the entry point for the leftmost point.
+* It reads the X coordinate from rastx0l/h for line Y.
+cplotl
+        stx   ]savxreg
+        sty   ]savyreg
+
+        lda   ylooklo,y
+        sta   ]hbasl
+        lda   ylookhi,y
+_pg_or2 ora   #$20            ;operand is rewritten by SetPage
+        sta   ]hbasl+1
+
+* Convert the X coordinate into byte/bit.
+        ldx   rastx0l,y       ;x coord, lo
+        lda   rastx0h,y       ;>= 256?
+        beq   :lotabl         ;no, use the low table
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   cplotcom        ;always
+        BREAK                 ;debug
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+        jmp   cplotcom
+
+* cplotr is the entry point for the rightmost point.
+* We use rastx1 instead of rastx0.
+cplotr
+        lda   ylooklo,y
+        sta   ]hbasl
+        lda   ylookhi,y
+_pg_or3 ora   #$20            ;operand is rewritten by SetPage
+        sta   ]hbasl+1
+
+* If we just plotted the left point on the same line,
+* we can skip the Y-lookup by jumping here.
+cplotrn
+        stx   ]savxreg
+        sty   ]savyreg
+
+        ldx   rastx1l,y       ;x coord, lo
+        lda   rastx1h,y       ;>= 256?
+        beq   :lotabl         ;no, use the low table
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   cplotcom        ;always
+        BREAK                 ;debug
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+
+* Plot the point.  The byte offset (0-39) is in Y,
+* the bit offset (0-6) is in A.
+cplotcom
+        tax
+        lda   colorline,y     ;start with color pattern
+        eor   (]hbasl),y      ;flip all bits
+        and   andmask,x       ;clear other bits
+        eor   (]hbasl),y      ;restore ours, set theirs
+        sta   (]hbasl),y
+
+        ldx   ]savxreg
+        ldy   ]savyreg
+        rts
+
+* Reconfigure calc_circle to either JSR to cplotl/r,
+* or just BIT the address (a 4-cycle no-op).  The
+* desired instruction is in A.
+fixcplot
+        do    USE_FAST        ;*****
+        sta   _cp00           ;no-clip call sites (fast build only)
+        sta   _cp01
+        sta   _cp02
+        sta   _cp03
+        sta   _cp04
+        sta   _cp05
+        sta   _cp06
+        sta   _cp07
+        fin                   ;*****
+        sta   _cp08           ;clipping call sites
+        sta   _cp09
+        sta   _cp10
+        sta   _cp11
+        sta   _cp12
+        sta   _cp13
+        sta   _cp14
+        sta   _cp15
+        rts
diff --git a/FDRAW.LINE.S b/FDRAW.LINE.S
new file mode 100644
index 0000000..db0df77
--- /dev/null
+++ b/FDRAW.LINE.S
@@ -0,0 +1,588 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Point and line functions     *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+
+********************************
+*
+* Draw a single point in the current color.
+*
+* Inputs: in_x0l/in_x0h (X), in_y0 (Y).
+*
+********************************
+DrawPoint
+]hbasl  equ   zptr0
+
+        ldy   in_y0
+        lda   ylooklo,y
+        sta   ]hbasl
+        lda   ylookhi,y
+        ora   g_page
+        sta   ]hbasl+1
+
+        ldx   in_x0l          ;x coord, lo
+        lda   in_x0h          ;>= 256?
+        beq   :lotabl         ;no, use the low table
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   :plotit         ;always
+        BREAK                 ;debug
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+
+* Plot the point.  The byte offset (0-39) is in Y,
+* the bit offset (0-6) is in A.
+:plotit
+        tax
+        lda   colorline,y     ;start with color pattern
+        eor   (]hbasl),y      ;flip all bits
+        and   andmask,x       ;clear other bits
+        eor   (]hbasl),y      ;restore ours, set theirs
+        sta   (]hbasl),y
+        rts
+
+
+********************************
+*
+* Draw a line between two points.
+*
+* Inputs: in_x0l/h,in_y0 and in_x1l/h,in_y1.
+*
+********************************
+DrawLine
+
+]hbasl  equ   zptr0
+]xposl  equ   zloc0           ;always left edge
+]xposh  equ   zloc1
+]ypos   equ   zloc2           ;top or bottom
+]deltaxl equ  zloc3
+]deltaxh equ  zloc4
+]deltay equ   zloc5
+]count  equ   zloc6
+]counth equ   zloc7
+]diff   equ   zloc8
+]diffh  equ   zloc9
+]andmask equ  zloc10
+]wideflag equ zloc11          ;doesn't really need DP
+
+* We use a traditional Bresenham run-length approach.
+* Run-slicing is possible, but the code is larger
+* and the increased cost means it's only valuable
+* for longer lines.  An optimal solution would switch
+* approaches based on line length.
+*
+* Start by identifying whether x0 or x1 is on the
+* left.  To make life simpler we always work from
+* left to right, flipping the coordinates if
+* needed.
+*
+* We also need to figure out if the line is more
+* than 255 pixels long -- which, because of
+* inclusive coordinates, means abs(x0-x1) > 254.
+        lda   in_x1l          ;assume x0 on left
+        sec
+        sbc   in_x0l
+        tax
+        beq   checkvert       ;low bytes even, check hi
+        lda   in_x1h
+        sbc   in_x0h
+        bcs   lx0left
+
+* x1 is on the left, so the values are negative
+* (hi byte in A, lo byte in X)
+lx0right eor  #$ff            ;invert hi
+        sta   ]deltaxh        ;store
+        txa
+        eor   #$ff            ;invert lo
+        sta   ]deltaxl
+        inc   ]deltaxl        ;add one for 2s complement
+        bne   :noinchi        ;rolled into high byte?
+        inc   ]deltaxh        ;yes
+:noinchi lda  in_x1l          ;start with x1
+        sta   ]xposl
+        lda   in_x1h
+        sta   ]xposh
+        lda   in_y1
+        sta   ]ypos
+        sec
+        sbc   in_y0           ;compute deltay
+        jmp   lncommon
+
+checkvert
+        lda   in_x1h          ;diff high bytes
+        sbc   in_x0h          ;(carry still set)
+        blt   lx0right        ;width=256, x0 right
+        bne   lx0left         ;width=256, x0 left
+        jmp   vertline        ;all zero, go vert
+
+* (branch back from below)
+* This is a purely horizontal line.  We farm the job
+* out to the raster fill code for speed.  (There's
+* no problem with the line code handling it; it's just
+* more efficient to let the raster code do it.)
+phorizontal
+        ldy   ]ypos
+        sty   rast_top
+        sty   rast_bottom
+        lda   ]xposl
+        sta   rastx0l,y
+        clc
+        adc   ]deltaxl        ;easier to add delta back
+        sta   rastx1l,y       ; in than sort out which
+        lda   ]xposh          ; arg is left vs. right
+        sta   rastx0h,y
+        adc   ]deltaxh
+        sta   rastx1h,y
+        jmp   FillRaster
+
+* x0 is on the left, so the values are positive
+lx0left stx   ]deltaxl
+        sta   ]deltaxh
+        lda   in_x0l          ;start with x0
+        sta   ]xposl
+        lda   in_x0h
+        sta   ]xposh
+        lda   in_y0           ;and y0
+        sta   ]ypos
+        sec
+        sbc   in_y1           ;compute deltay
+
+* Value of (starty - endy) is in A, flags still set.
+lncommon
+        bcs   :posy
+        eor   #$ff            ;negative, invert
+        adc   #$01            ;(carry is clear here, so this adds 1)
+        sta   ]deltay
+        lda   #$e8            ;INX
+        bne   gotdy
+:posy
+_lmb    beq   phorizontal     ;SetLineMode turns this into a BIT for xdraw
+        sta   ]deltay
+        lda   #$ca            ;DEX
+gotdy   sta   _hmody
+        sta   _vmody
+        sta   _wmody
+
+        do    0               ;***** for regression test
+        ldx   #$01
+        lda   ]deltaxh
+        bne   :iswide
+        lda   ]deltaxl
+        cmp   #$ff            ;== 255?
+        beq   :iswide
+        ldx   #$00            ;notwide
+:iswide stx   $300
+        lda   ]xposl
+        sta   $301
+        lda   ]xposh
+        sta   $302
+        lda   ]ypos
+        sta   $303
+        ldx   ]deltaxl
+        stx   $304
+        ldx   ]deltaxh
+        stx   $305
+        ldx   ]deltay
+        stx   $306
+        lda   _hmody
+        and   #$20            ;nonzero means inc,
+        sta   $307            ; zero means dec
+        fin                   ;*****
+
+* At this point we have the initial X position in
+* ]xposl/h, the initial Y position in ]ypos,
+* deltax in ]deltaxl/h, deltay in ]deltay, and we've
+* tweaked the Y-update instructions to either INX or
+* DEX depending on the direction of movement.
+*
+* The next step is to decide whether the line is
+* horizontal-dominant or vertical-dominant, and
+* branch to the appropriate handler.
+*
+* The core loops for horiz and vert take about
+* 80 cycles when moving diagonally, and about
+* 20 fewer when moving in the primary direction.
+* The wide-horiz is a bit slower.
+        ldy   #$01            ;set "wide" flag to 1
+        lda   ]deltaxl
+        ldx   ]deltaxh
+        bne   horzdom         ;width >= 256
+        cmp   #$ff            ;width == 255
+        beq   horzdom
+        dey                   ;not wide
+        cmp   ]deltay
+        bge   horzdom         ; for diagonal lines
+        jmp   vertdom
+
+* We could special-case pure-diagonal lines here
+* (just BEQ a couple lines up).  It does
+* represent our worst case.  I'm not convinced
+* we'll see them often enough to make it worthwhile.
+
+
+* horizontal-dominant
+horzdom
+        sty   ]wideflag
+        sta   ]count          ;:count = deltax + 1
+        inc   ]count
+        lsr                   ;:diff = deltax / 2
+        sta   ]diff
+
+* set Y to the byte offset in the line
+* load the AND mask into ]andmask
+        ldx   ]xposl
+        lda   ]xposh          ;>= 256?
+        beq   :lotabl         ;no, use the low table
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   :gottab         ;always
+*       BREAK ;debug
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+:gottab
+        tax
+        lda   andmask,x
+        sta   ]andmask
+
+* Set initial value for line address.
+        ldx   ]ypos
+        lda   ylooklo,x
+        sta   ]hbasl
+        lda   ylookhi,x
+        ora   g_page
+        sta   ]hbasl+1
+
+        lda   ]wideflag       ;is this a "wide" line?
+        beq   :notwide        ;nope, stay local
+        jmp   widedom
+
+:notwide lda  colorline,y     ;set initial color mask
+        sta   _hlcolor+1
+        jmp   horzloop
+
+hrts    rts
+
+* bottom of loop, essentially
+hnoroll sta   ]diff           ;3
+hdecc   dec   ]count          ;5 :count--
+        beq   hrts            ;2 :while (count != 0)
+                              ;= 7 or 10
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+horzloop
+_hlcolor lda  #$00            ;2 start with color pattern
+_lmdh   eor   (]hbasl),y      ;5 flip all bits
+        and   ]andmask        ;3 clear other bits
+        eor   (]hbasl),y      ;5 restore ours, set theirs
+        sta   (]hbasl),y      ;6 = 21
+
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+*
+* If this is a shallow line we would benefit from
+* keeping the index in X and just doing a 4-cycle
+* indexed load to get the mask.  Not having the
+* line number in X makes the line calc more
+* expensive for steeper lines though.
+        lda   ]andmask        ;3
+        asl                   ;2 shift, losing hi bit
+        eor   #$80            ;2 set the hi bit
+        bne   :noh8           ;3 cleared hi bit?
+* We could BEQ away and branch back in, but this
+* happens every 7 iterations, so on average it's
+* a very small improvement.  If we happen to branch
+* across a page boundary the double-branch adds
+* two more cycles and we lose.
+        iny                   ;2 advance to next byte
+        lda   colorline,y     ;4 update color mask
+        sta   _hlcolor+1      ;4
+        lda   #$81            ;2 reset
+:noh8   sta   ]andmask        ;3 = 13 + ((12-1)/7) = 14
+
+* Update error diff.
+        lda   ]diff           ;3
+        sec                   ;2
+        sbc   ]deltay         ;3 :diff -= deltay
+        bcs   hnoroll         ;2+ :if (diff < 0) ...
+                              ;= 11 level, 10 up/down
+        adc   ]deltaxl        ;3 : diff += deltax
+        sta   ]diff           ;3
+_hmody  inx                   ;2 : ypos++ (or --)
+        lda   ylooklo,x       ;4 update hbasl after line
+        sta   ]hbasl          ;3 change
+        lda   ylookhi,x       ;4
+_pg_or4 ora   #$20            ;2
+        sta   ]hbasl+1        ;3
+        bne   hdecc           ;3 = +27 this path -> 37
+        BREAK
+* horizontal: 10+21+14+11=56 cycles/pixel
+* diagonal: 7+21+14+37=79 cycles/pixel
+
+
+* Vertical-dominant line.  Could go up or down.
+* The loop ends when X reaches whichever of
+* in_y0/in_y1 is NOT the starting row (]ypos).
+vertdom
+        ldx   in_y0
+        cpx   ]ypos           ;starting at y0?
+        bne   :endy0          ;nope, started at y1: end at y0
+        ldx   in_y1           ;yup, so end at y1
+:endy0  stx   _vchk+1         ;end condition
+
+        lda   ]deltay
+        lsr
+        sta   ]diff           ;:diff = deltay / 2
+
+* set Y to the byte offset in the line
+* load the AND mask into ]andmask
+        ldx   ]xposl
+        lda   ]xposh          ;>= 256?
+        beq   :lotabl         ;no, use the low table
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   :gottab         ;always
+        BREAK                 ;debug
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+:gottab
+        tax
+        lda   andmask,x       ;initial pixel mask
+        sta   ]andmask
+
+        lda   colorline,y     ;initial color mask
+        sta   _vlcolor+1
+
+        ldx   ]ypos
+        jmp   vertloop
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+
+* Bottom of loop, essentially.
+vnoroll sta   ]diff           ;3
+
+vertloop
+        lda   ylooklo,x       ;4
+        sta   ]hbasl          ;3
+        lda   ylookhi,x       ;4
+_pg_or5 ora   #$20            ;2
+        sta   ]hbasl+1        ;3 = 16
+
+_vlcolor lda  #$00            ;2 start with color pattern
+_lmdv   eor   (]hbasl),y      ;5 flip all bits
+        and   ]andmask        ;3 clear other bits
+        eor   (]hbasl),y      ;5 restore ours, set theirs
+        sta   (]hbasl),y      ;6 = 21
+
+_vchk   cpx   #$00            ;2 was this last line?
+        beq   vrts            ;2 yes, done
+_vmody  inx                   ;2 :ypos++ (or --)
+
+* Update error diff.
+        lda   ]diff           ;3
+        sec                   ;2
+        sbc   ]deltaxl        ;3 :diff -= deltax
+        bcs   vnoroll         ;2 :if (diff < 0) ...
+                              ;= 10 vert, 9 move right
+
+        adc   ]deltay         ;3 : diff += deltay
+        sta   ]diff           ;3
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+        lda   ]andmask        ;3
+        asl                   ;2 shift, losing hi bit
+        eor   #$80            ;2 set the hi bit
+        beq   :is8            ;2+ goes to zero on 8th bit
+        sta   ]andmask        ;3
+        bne   vertloop        ;3 = 21 + (18/7) = 24
+        BREAK
+
+:is8    iny                   ;2 advance to next byte
+        lda   colorline,y     ;4 update color
+        sta   _vlcolor+1      ;4
+        lda   #$81            ;2 reset
+        sta   ]andmask        ;3
+        bne   vertloop        ;3 = 18
+        BREAK
+vrts    rts
+* vertical: 3 + 16 + 21 + 6 + 10 = 56 cycles
+* diagonal: 16 + 21 + 6 + 9 + 24 = 76 cycles
+
+
+* "Wide" horizontally-dominant loop.  We have to
+* maintain error-diff and deltax as 16-bit values.
+* Most of the setup from the "narrow" version carried
+* over, but we have to re-do the count and diff.
+*
+* Normally we set count to (deltax + 1) and decrement
+* to zero, but it's actually easier to set it equal
+* to deltax and check for -1.
+widedom
+        lda   ]deltaxh        ;:count = deltax
+        sta   ]counth
+        ldx   ]deltaxl
+        stx   ]count
+        stx   ]diff
+        lsr                   ;:diff = deltax / 2
+        ror   ]diff
+        sta   ]diffh
+        ldx   ]ypos
+
+        lda   colorline,y     ;set initial color mask
+        sta   _wlcolor+1
+
+* We keep the byte offset in the line in Y, and the
+* line index in X, for the entire loop.
+wideloop
+_wlcolor lda  #$00            ;2 start with color pattern
+_lmdw   eor   (]hbasl),y      ;5 flip all bits
+        and   ]andmask        ;3 clear other bits
+        eor   (]hbasl),y      ;5 restore ours, set theirs
+        sta   (]hbasl),y      ;6 = 21
+
+* Move right.  We shift the bit mask that determines
+* the pixel.  When we shift into bit 7, we know it's
+* time to advance another byte.
+        lda   ]andmask        ;3
+        asl                   ;2 shift, losing hi bit
+        eor   #$80            ;2 set the hi bit
+        bne   :not7           ;3 goes to zero on 8th bit
+        iny                   ; 2 advance to next byte
+        lda   colorline,y     ; 4 update color mask
+* This loop loads its color from _wlcolor, so that is
+* the operand to patch.  (It used to store into
+* _hlcolor+1, the narrow loop's operand, which left
+* wide lines using a stale color byte after the first
+* byte boundary.)
+        sta   _wlcolor+1      ; 4
+        lda   #$81            ; 2 reset
+:not7   sta   ]andmask        ;3 = 13 usually, 25 every 7
+
+* Update error diff, which is a positive number.  If
+* it goes negative ("if (diff < 0)") we act.
+        lda   ]diff
+        sec
+        sbc   ]deltay         ;:diff -= deltay
+        bcs   wnoroll         ;didn't even roll low byte
+        dec   ]diffh          ;check hi byte
+        bpl   wnoroll         ;went 1->0, keep going
+
+        adc   ]deltaxl        ;: diff += deltax
+        sta   ]diff
+        lda   ]diffh
+        adc   ]deltaxh
+        sta   ]diffh
+_wmody  inx                   ;: ypos++ (or --)
+        lda   ylooklo,x       ;update hbasl after line
+        sta   ]hbasl          ; change
+        lda   ylookhi,x
+_pg_or6 ora   #$20
+        sta   ]hbasl+1
+        bne   wdecc
+        BREAK
+
+wnoroll sta   ]diff
+
+wdecc   dec   ]count          ;5 :count--
+        lda   ]count          ;3
+        cmp   #$ff            ;2
+        bne   wideloop        ;3 :while (count > -1)
+        dec   ]counth         ;low rolled, decr high
+        beq   wideloop        ;went 1->0, keep going
+        rts
+
+
+* Pure-vertical line.  These are common in certain
+* applications, and checking for it only adds two
+* cycles to the general case.
+vertline
+        ldx   in_y0
+        ldy   in_y1
+        cpx   in_y1           ;y0 < y1?
+        blt   :usey0          ;yes, go from y0 to y1
+        txa                   ;swap X/A
+        tay
+        ldx   in_y1
+:usey0  stx   ]ypos
+        iny                   ;loop ends one past the bottom row
+        sty   _pvytest+1
+
+        ldx   in_x0l          ;xc lo
+        lda   in_x0h          ;>= 256?
+        beq   :lotabl
+        ldy   div7hi,x
+        lda   mod7hi,x
+        bpl   :gotit          ;always
+:lotabl ldy   div7lo,x
+        lda   mod7lo,x
+
+* Byte offset is in Y, mod-7 value is in A.
+:gotit  tax
+        lda   andmask,x
+        sta   _pvand+1        ;this doesn't change
+
+        lda   colorline,y
+        sta   _pvcolor+1      ;nor does this
+
+        ldx   ]ypos           ;top line
+
+* There's a trick where, when (linenum & 0x07) is
+* nonzero, you just add 4 to hbasl+1 instead of
+* re-doing the lookup.  However, TXA+AND+BEQ
+* followed by LDA+CLC+ADC+STA is 16 cycles, the same
+* as our self-modified lookup, so it's not a win.
+* (And if we used a second ylookhi and self-modded
+* the table address, we could shave off another 2.)
+
+* Main pure-vertical loop
+pverloop
+        lda   ylooklo,x       ;4
+        sta   ]hbasl          ;3
+        lda   ylookhi,x       ;4
+_pg_or7 ora   #$20            ;2
+        sta   ]hbasl+1        ;3 (= 16)
+
+_pvcolor lda  #$00            ;2 start with color pattern
+_lmdpv  eor   (]hbasl),y      ;5 flip all bits
+_pvand  and   #$00            ;2 clear other bits
+        eor   (]hbasl),y      ;5
+        sta   (]hbasl),y      ;6 (= 20)
+
+        inx                   ;2
+_pvytest cpx  #$00            ;2 done?
+        bne   pverloop        ;3 = 7
+        rts
+* 43 cycles/pixel
+
+
+********************************
+*
+* Set the line mode according to in_arg
+*
+* A slightly silly feature to get xdraw lines
+* without really working for it.
+*
+* in_arg == 0 selects standard drawing; any other
+* value selects xdraw.  The mode is applied by
+* overwriting the opcode byte at _lmb (the branch to
+* the horizontal-line shortcut) and at the first EOR
+* of each line loop.
+*
+********************************
+SetLineMode
+        lda   in_arg
+        beq   :standard
+
+* configure for xdraw
+        lda   #$24            ;BIT dp
+        sta   _lmb
+        sta   _lmdh
+        sta   _lmdv
+        sta   _lmdw
+        sta   _lmdpv
+        rts
+
+* configure for standard drawing
+:standard lda #$f0            ;BEQ
+        sta   _lmb
+        lda   #$51            ;EOR (dp),y
+        sta   _lmdh
+        sta   _lmdv
+        sta   _lmdw
+        sta   _lmdpv
+        rts
diff --git a/FDRAW.S b/FDRAW.S
new file mode 100644
index 0000000..661f8d1
--- /dev/null
+++ b/FDRAW.S
@@ -0,0 +1,805 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Main source file             *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* Set to 1 to build FDRAW.FAST, set to zero to
+* build FDRAW.SMALL.
+USE_FAST equ  1
+
+* Set to 1 to turn on beeps/clicks for debugging.
+NOISE_ON equ  0
+
+
+        lst   off
+**      org $6000
+
+*
+* Macros.
+*
+spkr    equ   $c030
+bell    equ   $ff3a
+
+* If enabled, click the speaker (changes flags only).
+CLICK   mac
+        do    NOISE_ON
+        bit   spkr
+        fin
+        <<<
+* If enabled, beep the speaker (scrambles regs).
+BEEP    mac
+        do    NOISE_ON
+        jsr   bell
+        fin
+        <<<
+* If enabled, insert a BRK.
+BREAK   mac
+        do    NOISE_ON
+        brk   $99
+        fin
+        <<<
+
+* In "fast" mode, we align tables on page boundaries so we
+* don't take a 1-cycle hit when the indexing crosses a page.
+* In "small" mode, we skip the alignment.
+PG_ALIGN mac
+        do    USE_FAST
+        ds    \
+        fin
+        <<<
+
+*
+* Hi-res screen constants.
+*
+BYTES_PER_ROW = 40
+NUM_ROWS = 192
+NUM_COLS = 280
+
+*
+* Variable storage.  We assign generic names to
+* zero-page scratch locations, then assign variables
+* with real names to these.
+*
+* 06-09 are unused (except by SWEET-16)
+* 1a-1d are Applesoft hi-res scratch
+* cc-cf are only used by INTBASIC
+* eb-ef and ff appear totally unused by ROM routines
+*
+zptr0   equ   $1a             ;2b
+zloc0   equ   $06
+zloc1   equ   $07
+zloc2   equ   $08
+zloc3   equ   $09
+zloc4   equ   $1c
+zloc5   equ   $1d
+zloc6   equ   $cc
+zloc7   equ   $cd
+zloc8   equ   $ce
+zloc9   equ   $cf
+zloc10  equ   $eb
+zloc11  equ   $ec
+zloc12  equ   $ed
+zloc13  equ   $ee
+
+
+********************************
+*
+* Entry points for external programs.
+*
+* Layout: JMP Init + 2 version bytes + 8 parameter
+* bytes + 3 pad bytes = 16 bytes, followed by the
+* jump table and the raster-fill variables/pointers.
+*
+********************************
+Entry
+        jmp   Init            ;initialize data tables
+        dfb   0,3             ;version number
+
+*
+* Parameters passed from external programs.
+*
+in_arg  ds    1               ;generic argument
+in_x0l  ds    1               ;X coordinate 0, low part
+in_x0h  ds    1               ;X coordinate 0, high part
+in_y0   ds    1               ;Y coordinate 0
+in_x1l  ds    1               ;X coordinate 1, low part
+in_x1h  ds    1               ;X coordinate 1, high part
+in_y1   ds    1               ;Y coordinate 1
+in_rad  ds    1               ;radius for circles
+
+        ds    3               ;pad to 16 bytes
+
+        jmp   SetColor
+        jmp   SetPage
+        jmp   Clear
+        jmp   DrawPoint
+        jmp   DrawLine
+        jmp   DrawRect
+        jmp   FillRect
+        jmp   DrawCircle
+        jmp   FillCircle
+        jmp   SetLineMode
+        jmp   noimpl          ;reserved2
+        jmp   FillRaster
+
+* Raster fill values.  Top, bottom, and pointers to tables
+* for the benefit of external callers.
+rast_top ds   1
+rast_bottom ds 1
+        da    rastx0l
+        da    rastx0h
+        da    rastx1l
+        da    rastx1h
+
+noimpl  rts
+
+
+********************************
+*
+* Global variables.
+*
+********************************
+
+g_inited dfb  0               ;initialized?
+g_color dfb   0               ;hi-res color (0-7)
+g_page  dfb   $20             ;hi-res page ($20 or $40)
+
+
+********************************
+*
+* Initialize.
+*
+* Selects color 0, standard line mode and hi-res
+* page 1, and marks the library as initialized.
+*
+********************************
+Init
+        lda   #$00
+        sta   in_arg
+        jsr   SetColor        ;set color to zero
+        jsr   SetLineMode     ;set normal lines (in_arg still 0)
+        lda   #$20
+        sta   in_arg
+        sta   g_inited        ;any nonzero value means "initialized"
+        jmp   SetPage         ;set hi-res page 1
+
+
+********************************
+*
+* Set the color.
+*
+* in_arg holds the color; only the low 3 bits are used.
+* NOTE(review): the "same as old color" test compares
+* the unmasked in_arg against g_color, and returns
+* without touching the colorline table when they match
+* (which is the case for the call made by Init, since
+* g_color starts at 0).  Presumably colorline starts
+* out holding the color-0 pattern -- verify.
+*
+********************************
+SetColor
+        lda   in_arg
+        cmp   g_color         ;same as the old color?
+        beq   :done
+
+        and   #$07            ;safety first
+        sta   g_color
+
+* Update the "colorline" table, which provides a quick color
+* lookup for odd/even bytes.  We could also have one table
+* per color and self-mod the "LDA addr,y" instructions to
+* point to the current one, but that uses a bunch of memory
+* and is kind of ugly.  Takes 16 + (12 * 40) = 496 cycles.
+        tax                   ;2
+        lda   xormask,x       ;4
+        sta   :_xormsk+1      ;4
+
+        lda   oddcolor,x      ;4 (last byte, #39, is odd)
+        ldy   #BYTES_PER_ROW-1 ;2
+]loop   sta   colorline,y     ;5
+:_xormsk eor  #$00            ;2 flip to the other (odd/even) pattern
+        dey                   ;2
+        bpl   ]loop           ;3
+
+:done   rts
+
+
+********************************
+*
+* Set the page.
+*
+* in_arg must be $20 (page 1) or $40 (page 2); any
+* other value just rings the bell.  If Init has not
+* been called, prints an error message instead.
+*
+********************************
+SetPage
+        lda   g_inited        ;let's just check this
+        beq   noinit          ; (not called too often)
+
+        lda   in_arg
+        cmp   #$20
+        beq   :good
+        cmp   #$40
+        beq   :good
+        jmp   bell
+:good
+        sta   g_page
+
+        do    0               ;*****
+        cmp   ylookhi
+        beq   :tabok
+* Check to see if the values currently in the Y-lookup table
+* match our current page setting.  If they don't, we need to
+* adjust the code that does lookups.
+
+* This approach modifies the table itself, paying a large
+* cost now so we don't have to pay it on every lookup.
+* However, this costs 2+(16*192)=3074 cycles, while an
+* "ORA imm" only adds two to each lookup, so we'd have
+* to do a lot of drawing to make this worthwhile.
+* (Note: assumes ylookhi is based at $2000 not $0000)
+        ldy   #NUM_ROWS       ;2
+]loop   lda   ylookhi-1,y     ;4
+        eor   #$60            ;2 $20 <--> $40
+        sta   ylookhi-1,y     ;5
+        dey                   ;2
+        bne   ]loop           ;3
+
+        else                  ;*****
+
+* This approach uses self-modifying code to update the
+* relevant instructions.  It's a bit messy to have it
+* here, but it saves us from having to do it on
+* every call.
+*
+* We could also have a second y-lookup table and
+* use this to update the pointers.  That would let
+* us drop the "ORA imm" entirely, without the cost
+* of the rewrite above, but eating up another 192 bytes.
+        sta   _pg_or1+1       ;rastfill
+        sta   _pg_or2+1       ;circle hplot
+        sta   _pg_or3+1       ;circle hplot
+        sta   _pg_or4+1       ;drawline
+        sta   _pg_or5+1       ;drawline
+        sta   _pg_or6+1       ;drawline
+        sta   _pg_or7+1       ;drawline
+
+        fin                   ;*****
+
+:tabok  rts
+
+* Print :initmsg one character at a time, stopping at
+* the terminating zero byte.
+noinit  ldy   #$00
+]loop   lda   :initmsg,y
+        beq   :done
+        jsr   $fded           ;cout
+        iny
+        bne   ]loop
+:done   rts
+
+:initmsg asc  "FDRAW NOT INITIALIZED",87,87,00
+
+
+********************************
+*
+* Clear the screen to the current color.
+*
+********************************
+Clear
+
+        do    USE_FAST        ;*****
+* This performs a "visually linear" clear, erasing the screen
+* from left to right and top to bottom.  To reduce the amount
+* of code required we erase in thirds (top/middle/bottom).
+*
+* Compare to a "venetian blind" clear, which is what you get
+* if you erase memory linearly.
+*
+* The docs discuss different approaches.  This version
+* requires ((2 + 5*64 + 11) * 40 + 14) * 3 = 40002 cycles.
+* If we didn't divide it into thirds to keep the top-down
+* look, we'd need (5*64 + 9) * 120 = 39480 cycles, so
+* we're spending 522 cycles to avoid the venetian look.
+        lda   :clrloop+2      ;high address byte of first STA
+        cmp   g_page
+        beq   :pageok
+
+* We're on the wrong hi-res page.  Flip to the other one.
+* 4 + (20*64) = 1284 cycles to do the flip (+ a few more
+* because we're probably crossing a page boundary).
+* Y steps 192,189,...,3, so :clrloop-3+2,y walks the
+* high address byte of each of the 64 3-byte STAs.
+        BEEP
+        ldy   #NUM_ROWS       ;2
+]loop   lda   :clrloop-3+2,y  ;4
+        eor   #$60            ;2
+        sta   :clrloop-3+2,y  ;5
+        dey                   ;2
+        dey                   ;2
+        dey                   ;2
+        bne   ]loop           ;3
+
+:pageok ldx   g_color         ;grab the current color
+        lda   xormask,x
+        sta   :_xormsk+1
+        lda   evencolor,x
+
+        ldy   #0
+        jsr   :clearthird
+        ldy   #BYTES_PER_ROW
+        jsr   :clearthird
+        ldy   #BYTES_PER_ROW*2
+* fall through into :clearthird for final pass
+
+* Clear one third of the screen: 40 byte columns
+* starting at offset Y, 64 rows per column.  A holds
+* the color byte for the current column and is
+* toggled between the even and odd patterns after
+* each column.
+:clearthird
+        ldx   #BYTES_PER_ROW-1 ;2
+:clrloop sta  $2000,y         ;5 (* 64)
+        sta   $2400,y         ;this could probably be
+        sta   $2800,y         ; done with LUP math
+        sta   $2c00,y
+        sta   $3000,y
+        sta   $3400,y
+        sta   $3800,y
+        sta   $3c00,y
+        sta   $2080,y
+        sta   $2480,y
+        sta   $2880,y
+        sta   $2c80,y
+        sta   $3080,y
+        sta   $3480,y
+        sta   $3880,y
+        sta   $3c80,y
+        sta   $2100,y
+        sta   $2500,y
+        sta   $2900,y
+        sta   $2d00,y
+        sta   $3100,y
+        sta   $3500,y
+        sta   $3900,y
+        sta   $3d00,y
+        sta   $2180,y
+        sta   $2580,y
+        sta   $2980,y
+        sta   $2d80,y
+        sta   $3180,y
+        sta   $3580,y
+        sta   $3980,y
+        sta   $3d80,y
+        sta   $2200,y
+        sta   $2600,y
+        sta   $2a00,y
+        sta   $2e00,y
+        sta   $3200,y
+        sta   $3600,y
+        sta   $3a00,y
+        sta   $3e00,y
+        sta   $2280,y
+        sta   $2680,y
+        sta   $2a80,y
+        sta   $2e80,y
+        sta   $3280,y
+        sta   $3680,y
+        sta   $3a80,y
+        sta   $3e80,y
+        sta   $2300,y
+        sta   $2700,y
+        sta   $2b00,y
+        sta   $2f00,y
+        sta   $3300,y
+        sta   $3700,y
+        sta   $3b00,y
+        sta   $3f00,y
+        sta   $2380,y
+        sta   $2780,y
+        sta   $2b80,y
+        sta   $2f80,y
+        sta   $3380,y
+        sta   $3780,y
+        sta   $3b80,y
+        sta   $3f80,y
+:_xormsk eor  #$00            ;2 flip odd/even bits
+        iny                   ;2
+        dex                   ;2
+        bmi   :done           ;2
+        jmp   :clrloop        ;3
+:done   rts
+
+        else                  ;***** not USE_FAST
+
+* This version was suggested by Marcus Heuser on
+* comp.sys.apple2.programmer.  It does a "venetian blind"
+* clear, and takes (5 * 32 + 7) * 248 = 41416 cycles.
+* It overwrites half of the screen holes.
+        lda   :clrloop+5      ;high address byte of second STA
+        cmp   g_page
+        beq   :pageok
+
+* We're on the wrong hi-res page.  Flip to the other one.
+* 12 + (20*31) = 632 cycles to do the flip.  We have to
+* single out the first entry because it's $1f not $20.
+        BEEP
+        lda   :clrloop+2      ;4
+        eor   #$20            ;2 $1f <-> $3f
+        sta   :clrloop+2      ;4
+        ldy   #31*3           ;2
+]loop   lda   :clrloop+2,y    ;4
+        eor   #$60            ;2 $20 <-> $40
+        sta   :clrloop+2,y    ;5
+        dey                   ;2
+        dey                   ;2
+        dey                   ;2
+        bne   ]loop           ;3
+
+:pageok ldx   g_color
+        lda   xormask,x
+        sta   :_xormsk+1
+        lda   oddcolor,x
+        ldy   #248            ;120 + 8 + 120
+:clrloop
+]addr   =     $1fff
+        lup   32              ;begin a loop in assembler
+        sta   ]addr,y         ;5
+]addr   =     ]addr+$100      ;sta 20ff,21ff,...
+        --^
+:_xormsk eor  #$00            ;2
+        dey                   ;2
+        bne   :clrloop        ;3
+        rts
+
+        fin                   ;***** not USE_FAST
+
+
+********************************
+*
+* Draw rectangle outline.
+*
+* Inputs: in_x0l/h,in_y0 and in_x1l/h,in_y1.
+*
+********************************
+DrawRect
+* We could just issue 4 line draw calls here, maybe
+* adjusting the vertical lines by 1 pixel up/down to
+* avoid overdraw.  But if the user wanted 4 lines,
+* they could just draw 4 lines.  Instead, we're going
+* to draw a double line on each edge to ensure that
+* the outline rectangle always has the correct color.
+*
+* Rather than draw two vertical lines, we draw a
+* two-pixel-wide filled rectangle on each side.
+*
+* We don't want to double-up if the rect is only one
+* pixel wide, so we have to check for that.
+*
+* If the rect is one pixel high, it's just a line.
+* If it's two pixels high, we don't need to draw
+* the left/right edges, just the top/bottom lines.
+* If it's more than two tall, we don't need to draw
+* the left/right edges on the top and bottom lines,
+* so we save a few cycles by skipping those.
+
+        lda   in_y1           ;copy top/bottom to local
+        sta   rast_bottom
+        dec   rast_bottom     ;move up one
+        sec
+        sbc   in_y0
+        beq   :isline         ;1 pixel high, just draw line
+        cmp   #1
+        beq   :twolines       ;2 pixels high, lines only
+        ldy   in_y0
+        iny                   ;start down a line
+        sty   rast_top
+
+        lda   in_x0h          ;check to see if left/right
+        cmp   in_x1h          ; coords are the same; if
+        bne   :notline        ; so, going +1/-1 at edge
+        lda   in_x0l          ; will overdraw.
+        cmp   in_x1l
+        bne   :notlin1
+
+:isline jmp   DrawLine        ;just treat like line
+
+* Set up left edge.  Top line is in Y.
+:notline lda  in_x0l
+:notlin1 sta  rastx0l,y
+        clc
+        adc   #1
+        sta   rastx1l,y
+        lda   in_x0h
+        ora   #$80            ;"repeat" flag
+        sta   rastx0h,y
+        and   #$7f
+        adc   #0              ;carry from the low-byte add above
+        sta   rastx1h,y
+        jsr   FillRaster
+
+        ldy   rast_top
+        lda   in_x1l          ;now set up right edge
+        sta   rastx1l,y
+        sec
+        sbc   #1
+        sta   rastx0l,y
+        lda   in_x1h
+        sta   rastx1h,y
+        sbc   #0              ;borrow from the low-byte subtract above
+        ora   #$80            ;"repeat" flag
+        sta   rastx0h,y
+        jsr   FillRaster
+
+* Now the top/bottom lines.
+:twolines
+        ldy   in_y0
+        jsr   :drawline
+        ldy   in_y1
+
+* Fill the single row in Y from in_x0 to in_x1.
+:drawline
+        sty   rast_top
+        sty   rast_bottom
+        lda   in_x0l          ;copy left/right to the
+        sta   rastx0l,y       ; table entry for the
+        lda   in_x0h          ; appropriate line
+        sta   rastx0h,y
+        lda   in_x1l
+        sta   rastx1l,y
+        lda   in_x1h
+        sta   rastx1h,y
+        jmp   FillRaster
+
+
+********************************
+*
+* Draw filled rectangle.
+*
+********************************
+FillRect
+* Just fill out the raster table and call the fill routine.
+* We require y0=top, y1=bottom, x0=left, x1=right.
+* Only the top row's table entry is written; the
+* "repeat" flag tells FillRaster to reuse it.
+        ldy   in_y0
+        sty   rast_top
+        lda   in_y1
+        sta   rast_bottom
+
+        lda   in_x0l
+        sta   rastx0l,y
+        lda   in_x0h
+        ora   #$80            ;"repeat" flag
+        sta   rastx0h,y
+        lda   in_x1l
+        sta   rastx1l,y
+        lda   in_x1h
+        sta   rastx1h,y
+
+        jmp   FillRaster
+
+
+********************************
+*
+* Fill an area defined by the raster tables.
+*
+********************************
+FillRaster
+
+* Render rasterized output.  The left and right edges
+* are stored in the rastx0/rastx1 tables, and the top
+* and bottom-most pixels are in rast_top/rast_bottom.
+*
+* This can be used to render an arbitrary convex
+* polygon after it has been rasterized.
+*
+* If the high bit of the high byte of X0 is set, we
+* go into "repeat" mode, where we just repeat the
+* previous line.  This saves about 40 cycles of
+* overhead per line when drawing rectangles, plus
+* what we would have to spend to populate multiple
+* lines of the raster table.  It only increases the
+* general per-line cost by 3 cycles.
+*
+* We could use the "repeat" flag to use this code to
+* draw vertical lines, though that's mostly of value
+* to an external caller who knows ahead of time that
+* the line is vertical. The DrawLine code is pretty
+* good with vertical lines, and adding additional
+* setup time to every vertical-dominant line to
+* decide if it should call here seems like a
+* losing proposition.
+
+]hbasl      equ   zptr0
+]hbash      equ   zptr0+1
+]lftbyte    equ   zloc0
+]lftbit     equ   zloc1
+]rgtbyte    equ   zloc2
+]rgtbit     equ   zloc3
+]line       equ   zloc4
+]andmask    equ   zloc5
+]cur_line   equ   zloc6
+]repting    equ   zloc7
+
+            ldx   g_color           ;configure color XOR byte
+            lda   xormask,x
+            do    USE_FAST          ;*****
+            cmp   rast_unroll+3     ;already configured?
+            beq   :goodmask
+            jsr   fixrastxor        ;patch the unrolled EORs (mask in A)
+:goodmask
+            else
+            sta   _xorcolor+1       ;patch the EOR operand in the slow loop
+            fin                     ;*****
+
+            lda   #$00
+            sta   ]repting          ;not in repeat mode yet
+
+            ldy   rast_top
+
+* Main rasterization loop. Y holds the line number.
+rastloop
+            sty   ]cur_line         ;3
+            ldx   ylooklo,y         ;4
+            stx   ]hbasl            ;3
+            lda   ylookhi,y         ;4
+_pg_or1     ora   #$20              ;2 will be $20 or $40
+            sta   ]hbash            ;3 = 19 cycles
+            do    USE_FAST-1        ;***** i.e. not USE_FAST
+            stx   _wrhires+1        ;patch the STA address in the slow loop
+            sta   _wrhires+2
+            fin                     ;*****
+
+* divide left edge by 7
+            ldx   rastx0l,y         ;4 line num in Y
+            lda   rastx0h,y         ;4
+            bpl   :noflag           ;2
+            sta   rastx0h+1,y       ;4 propagate
+            lda   ]repting          ;3 first time through?
+            beq   :firstre          ;2 yup, finish calculations
+            lda   ]rgtbyte          ;3 need this in A
+            bpl   :repeat           ;3 always
+:firstre    lda   rastx0h,y         ;reload
+            sta   ]repting          ;any nonzero will do
+            and   #$7f              ;strip repeat flag
+:noflag     beq   :lotabl           ;high byte 0: use the "lo" tables
+            lda   mod7hi,x
+            sta   ]lftbit
+            lda   div7hi,x
+            sta   ]lftbyte
+            bpl   :gotlft           ;always
+            BREAK                   ;debug
+:lotabl     lda   mod7lo,x
+            sta   ]lftbit
+            lda   div7lo,x
+            sta   ]lftbyte
+:gotlft
+
+* divide right edge by 7
+            ldx   rastx1l,y         ;4 line num in Y
+            lda   rastx1h,y         ;4
+            beq   :lotabr           ;3
+            lda   mod7hi,x
+            sta   ]rgtbit
+            lda   div7hi,x
+            sta   ]rgtbyte
+            bpl   :gotrgt           ;always
+            BREAK                   ;debug
+:lotabr     lda   mod7lo,x          ;4
+            sta   ]rgtbit           ;3
+            lda   div7lo,x          ;4
+            sta   ]rgtbyte          ;3 = 25 for X1 < 256
+:gotrgt
+
+:repeat
+            cmp   ]lftbyte          ;3 (A holds ]rgtbyte on both paths)
+            bne   :not1byte         ;3
+
+* The left and right edges are in the same byte. We
+* need to set up the mask differently, so we deal with
+* it as a special case.
+            ldy   ]lftbit
+            lda   leftmask,y        ;create the AND mask
+            ldx   ]rgtbit
+            and   rightmask,x       ;strip out bits on right
+            sta   ]andmask
+
+            ldy   ]lftbyte
+            lda   colorline,y       ;get color bits
+            eor   (]hbasl),y        ;combine w/screen
+            and   ]andmask          ;remove not-ours
+            eor   (]hbasl),y        ;combine again
+            sta   (]hbasl),y
+            jmp   rastlinedone
+
+* This is the more general case. We special-case the
+* left and right edges, then byte-stomp the middle.
+* On entry, ]rgtbyte is in A
+:not1byte
+            sec                     ;2 compute number of full
+            sbc   ]lftbyte          ;3 and partial bytes to
+            tax                     ;2 draw
+            inx                     ;2
+
+            ldy   ]rgtbit           ;3
+            cpy   #6                ;2
+            beq   :rgtnospcl        ;3
+            lda   rightmask,y       ;handle partial-byte right
+            sta   ]andmask
+            ldy   ]rgtbyte
+            lda   colorline,y
+            eor   (]hbasl),y
+            and   ]andmask
+            eor   (]hbasl),y
+            sta   (]hbasl),y
+            dex                     ;adjust count
+:rgtnospcl
+
+            ldy   ]lftbit           ;3 check left for partial
+            beq   :lftnospcl        ;3
+            lda   leftmask,y        ;handle partial-byte left
+            sta   ]andmask
+            ldy   ]lftbyte
+            lda   colorline,y
+            eor   (]hbasl),y
+            and   ]andmask
+            eor   (]hbasl),y
+            sta   (]hbasl),y
+            dex                     ;adjust count
+            beq   rastlinedone      ;bail if all done
+            iny                     ;advance start position
+            bne   :liny             ;always
+            BREAK
+:lftnospcl
+
+            ldy   ]lftbyte          ;3
+:liny
+
+            do    USE_FAST          ;***** "fast" loop
+* Instead of looping, jump into an unrolled loop.
+* Cost is 10 cycles per byte with an extra 14 cycles
+* of overhead, so we start to win at 4 bytes.
+            lda   rastunidx,x       ;4
+            sta   :_rastun+1        ;4
+            lda   colorline,y       ;4 get odd/even color val
+:_rastun    jmp   rast_unroll       ;3
+
+            else                    ;***** "slow" loop
+* Inner loop of the renderer. This runs 0-40x.
+* Cost is 14 cycles/byte.
+            lda   colorline,y       ;get appropriate odd/even val
+_wrhires    sta   $2000,y           ;5 replaced with line addr
+_xorcolor   eor   #$00              ;2 replaced with $00/$7f
+            iny                     ;2
+            dex                     ;2
+            bne   _wrhires          ;3
+
+            fin                     ;*****
+
+rastlinedone
+            ldy   ]cur_line         ;3 more lines to go?
+            cpy   rast_bottom       ;4
+            bge   :done             ;2
+            iny                     ;2
+            jmp   rastloop          ;3 must have line in Y
+
+:done       rts
+
+fixrastxor
+            do    USE_FAST          ;*****
+* Update the EOR statements in the unrolled rastfill code.
+* Doing this with a loop takes ~600 cycles, doing it with
+* unrolled stores takes 160. We only do this when we
+* need to, so changing the color from green to blue won't
+* cause this to run.
+*
+* Call with the XOR value in A.
+]offset     =     0
+            lup   BYTES_PER_ROW
+            sta   rast_unroll+3+]offset
+]offset     =     ]offset+5         ;5 bytes per unrolled iteration
+            --^
+            BEEP
+            rts
+            fin                     ;*****
+
+
+* include the line functions
+** put FDRAW.LINE
+
+* include the circle functions
+** put FDRAW.CIRCLE
+
+            lst   on
+CODE_END    equ   *                 ;end of code section
+            lst   off
+
+* include the data tables
+** put FDRAW.TABLES
+
+            lst   on
+DAT_END     equ   *                 ;end of data / BSS
+            lst   off
+
+* Save the appropriate object file.
+** do USE_FAST
+** sav FDRAW.FAST
+** else
+** sav FDRAW.SMALL
+** fin
diff --git a/FDRAW.TABLES.S b/FDRAW.TABLES.S
new file mode 100644
index 0000000..d1d91f2
--- /dev/null
+++ b/FDRAW.TABLES.S
@@ -0,0 +1,339 @@
+********************************
+*                              *
+* Fast Apple II Graphics       *
+* By Andy McFadden             *
+* Version 0.3, Aug 2015        *
+*                              *
+* Pre-computed data and        *
+* large internal buffers.      *
+* (Included by FDRAW.S)        *
+*                              *
+* Developed with Merlin-16     *
+*                              *
+********************************
+
+* Expected layout with alignment:
+*
+* P1 ylooklo, misc tables
+* P2 ylookhi, colorline
+* P3 rastx0l
+* P4 rastx0h
+* P5 rastx1l
+* P6 rastx1h, div7hi, mod7hi
+* P7 div7lo
+* P8 mod7lo
+* P9 rast_unroll, rastunidx
+*
+* Tables should be just under $900 bytes.
+
+            PG_ALIGN
+
+* Hi-res Y lookup, low part (192 bytes).
+ylooklo     HEX   0000000000000000
+            HEX   8080808080808080
+            HEX   0000000000000000
+            HEX   8080808080808080
+            HEX   0000000000000000
+            HEX   8080808080808080
+            HEX   0000000000000000
+            HEX   8080808080808080
+            HEX   2828282828282828
+            HEX   a8a8a8a8a8a8a8a8
+            HEX   2828282828282828
+            HEX   a8a8a8a8a8a8a8a8
+            HEX   2828282828282828
+            HEX   a8a8a8a8a8a8a8a8
+            HEX   2828282828282828
+            HEX   a8a8a8a8a8a8a8a8
+            HEX   5050505050505050
+            HEX   d0d0d0d0d0d0d0d0
+            HEX   5050505050505050
+            HEX   d0d0d0d0d0d0d0d0
+            HEX   5050505050505050
+            HEX   d0d0d0d0d0d0d0d0
+            HEX   5050505050505050
+            HEX   d0d0d0d0d0d0d0d0
+
+* Color masks for odd/even bytes, colors 0-7.
+evencolor   dfb   $00,$2a,$55,$7f,$80,$aa,$d5,$ff
+oddcolor    dfb   $00,$55,$2a,$7f,$80,$d5,$aa,$ff
+
+* XOR mask for colors 0-7 - non-BW flip on odd/even.
+xormask     dfb   $00,$7f,$7f,$00,$00,$7f,$7f,$00
+
+* AND mask for the 7 pixel positions, high bit set
+* for the color shift.
+andmask     dfb   $81,$82,$84,$88,$90,$a0,$c0
+
+* These are pixel AND masks, used with the modulo 7
+* result. Entry #2 in leftmask means we're touching
+* the rightmost 5 pixels, and entry #2 in rightmask
+* means we're touching the 3 leftmost pixels.
+*
+* The high bit is always set, because we want to
+* keep the color's high bit.
+leftmask    dfb   $ff,$fe,$fc,$f8,$f0,$e0,$c0
+rightmask   dfb   $81,$83,$87,$8f,$9f,$bf,$ff
+
+            PG_ALIGN
+
+* Hi-res Y lookup, high part (192 bytes).
+* OR with $20 or $40.
+ylookhi     HEX   0004080c1014181c
+            HEX   0004080c1014181c
+            HEX   0105090d1115191d
+            HEX   0105090d1115191d
+            HEX   02060a0e12161a1e
+            HEX   02060a0e12161a1e
+            HEX   03070b0f13171b1f
+            HEX   03070b0f13171b1f
+            HEX   0004080c1014181c
+            HEX   0004080c1014181c
+            HEX   0105090d1115191d
+            HEX   0105090d1115191d
+            HEX   02060a0e12161a1e
+            HEX   02060a0e12161a1e
+            HEX   03070b0f13171b1f
+            HEX   03070b0f13171b1f
+            HEX   0004080c1014181c
+            HEX   0004080c1014181c
+            HEX   0105090d1115191d
+            HEX   0105090d1115191d
+            HEX   02060a0e12161a1e
+            HEX   02060a0e12161a1e
+            HEX   03070b0f13171b1f
+            HEX   03070b0f13171b1f
+
+* Masks for current color (even/odd), e.g. 55 2a 55 2a ...
+* Updated whenever the color changes.
+colorline   ds    40
+
+            PG_ALIGN
+rastx0l     ds    NUM_ROWS
+            PG_ALIGN
+rastx0h     ds    NUM_ROWS
+            ds    1                 ;repeat mode can overstep
+            PG_ALIGN
+rastx1l     ds    NUM_ROWS
+            PG_ALIGN
+rastx1h     ds    NUM_ROWS
+
+* Lookup tables for dividing 0-279 by 7. The "hi"
+* parts are 24 bytes each, so they fit inside
+* the previous 192-byte entry. The "lo" parts
+* each fill a page.
+div7hi      HEX   2424242525252525
+            HEX   2525262626262626
+            HEX   2627272727272727
+mod7hi      HEX   0405060001020304
+            HEX   0506000102030405
+            HEX   0600010203040506
+
+            PG_ALIGN
+
+div7lo      HEX   0000000000000001
+            HEX   0101010101010202
+            HEX   0202020202030303
+            HEX   0303030304040404
+            HEX   0404040505050505
+            HEX   0505060606060606
+            HEX   0607070707070707
+            HEX   0808080808080809
+            HEX   0909090909090a0a
+            HEX   0a0a0a0a0a0b0b0b
+            HEX   0b0b0b0b0c0c0c0c
+            HEX   0c0c0c0d0d0d0d0d
+            HEX   0d0d0e0e0e0e0e0e
+            HEX   0e0f0f0f0f0f0f0f
+            HEX   1010101010101011
+            HEX   1111111111111212
+            HEX   1212121212131313
+            HEX   1313131314141414
+            HEX   1414141515151515
+            HEX   1515161616161616
+            HEX   1617171717171717
+            HEX   1818181818181819
+            HEX   1919191919191a1a
+            HEX   1a1a1a1a1a1b1b1b
+            HEX   1b1b1b1b1c1c1c1c
+            HEX   1c1c1c1d1d1d1d1d
+            HEX   1d1d1e1e1e1e1e1e
+            HEX   1e1f1f1f1f1f1f1f
+            HEX   2020202020202021
+            HEX   2121212121212222
+            HEX   2222222222232323
+            HEX   2323232324242424
+mod7lo      HEX   0001020304050600
+            HEX   0102030405060001
+            HEX   0203040506000102
+            HEX   0304050600010203
+            HEX   0405060001020304
+            HEX   0506000102030405
+            HEX   0600010203040506
+            HEX   0001020304050600
+            HEX   0102030405060001
+            HEX   0203040506000102
+            HEX   0304050600010203
+            HEX   0405060001020304
+            HEX   0506000102030405
+            HEX   0600010203040506
+            HEX   0001020304050600
+            HEX   0102030405060001
+            HEX   0203040506000102
+            HEX   0304050600010203
+            HEX   0405060001020304
+            HEX   0506000102030405
+            HEX   0600010203040506
+            HEX   0001020304050600
+            HEX   0102030405060001
+            HEX   0203040506000102
+            HEX   0304050600010203
+            HEX   0405060001020304
+            HEX   0506000102030405
+            HEX   0600010203040506
+            HEX   0001020304050600
+            HEX   0102030405060001
+            HEX   0203040506000102
+            HEX   0304050600010203
+
+
+* RastFill unrolled loop. At each step we store the current
+* color value, XOR it to flip the bits if needed, and advance.
+* The caller needs to set the appropriate initial value based
+* on whether the address is odd or even.
+*
+* We can use a 3-cycle "EOR dp" or a 2-cycle "EOR imm". The
+* former is one cycle slower, the latter requires us to
+* self-mod 40 instructions when the color changes.
+*
+* This must be page-aligned so that we can take the value
+* from the rastunidx table and self-mod a JMP without having
+* to do a 16-bit add. We have just enough room for the
+* unrolled loop (40*5+3) and x5 table (41) = 244 bytes, fits
+* on a single page.
+
+            do    USE_FAST          ;*****
+            ds    \
+]hbasl      equ   zptr0             ;must match FillRaster
+rast_unroll equ   *
+            lst   off
+            lup   BYTES_PER_ROW
+            sta   (]hbasl),y        ;6
+            eor   #$00              ;2
+            iny                     ;2 10 cycles, 5 bytes
+            --^
+            jmp   rastlinedone
+
+* Index into rast_unroll. If we need to output N bytes,
+* we want to jump to (rast_unroll + (40 - N) * 5) (where
+* 5 is the number of bytes per iteration).
+rastunidx
+]offset     =     BYTES_PER_ROW*5
+            lup   BYTES_PER_ROW+1   ;0-40
+            dfb   ]offset
+]offset     =     ]offset-5
+            --^
+
+            fin                     ;*****
+
+
+********************************
+*
+* Code used to generate tables above. If you want to
+* decrease load size, use these functions to generate
+* the data into empty memory, then discard the code.
+* (Maybe use a negative DS and overlap with rastx0l?)
+*
+********************************
+            DO    0                 ;*****
+
+init_ylook
+]hbasl      equ   zptr1
+]hbash      equ   zptr1+1
+
+* Initialize Y-lookup table. We just call the bascalc
+* function.
+            ldx   #NUM_ROWS         ;loop counter (192 rows)
+            ldy   #NUM_ROWS-1       ;row number / table index, 191..0
+]loop       tya
+            jsr   bascalc
+            lda   ]hbasl            ;bascalc leaves its result in ]hbasl/]hbash
+            sta   ylooklo,y
+            lda   ]hbash            ;$00-$1f, i.e. $0000-based
+* (no page bits here: users of ylookhi OR in $20 or $40 themselves)
+            sta   ylookhi,y
+            dey
+            dex
+            bne   ]loop
+            rts
+
+* Hi-res base address calculation. This is based on the
+* HPOSN routine at $F411.
+*
+* Call with the line in A. The results are placed into
+* zptr1. X and Y are not disturbed.
+*
+* The value is in the $0000-1fff range, so you must OR
+* the desired hi-res page in.
+*
+bascalc
+            pha
+            and   #$c0
+            sta   ]hbasl
+            lsr
+            lsr
+            ora   ]hbasl
+            sta   ]hbasl
+            pla
+            sta   ]hbash
+            asl
+            asl
+            asl
+            rol   ]hbash
+            asl
+            rol   ]hbash
+            asl
+            ror   ]hbasl
+            lda   ]hbash
+            and   #$1f              ;keep the result below $2000
+            sta   ]hbash
+            rts
+
+*
+* Create divide-by-7 tables.
+*
+mkdivtab
+]val        equ   zloc0
+
+            ldy   #0                ;Y = table index
+            sty   ]val              ;]val = current quotient
+            ldx   #0                ;X = current remainder (0-6)
+]loop       lda   ]val
+            sta   div7lo,y
+            txa
+            sta   mod7lo,y
+            inx
+            iny
+            beq   :lodone           ;Y wrapped: all 256 "lo" entries done
+            cpx   #7
+            bne   ]loop
+            inc   ]val              ;remainder hit 7: bump the quotient
+            ldx   #0
+            beq   ]loop             ;always
+:lodone                             ;safe to ignore ]val update
+]loop       lda   ]val
+            sta   div7hi,y
+            txa
+            sta   mod7hi,y
+            iny
+            cpy   #280-256          ;"hi" tables cover 256-279
+            beq   :hidone
+            inx
+            cpx   #7
+            bne   ]loop
+            inc   ]val
+            ldx   #0
+            beq   ]loop             ;always
+:hidone     rts
+
+            FIN                     ;*****
diff --git a/applecorn.po b/applecorn.po index 52aa70e..75c0752 100644 Binary files a/applecorn.po and b/applecorn.po differ diff --git a/applecorn.s b/applecorn.s index 1918910..5231398 100644 --- a/applecorn.s +++ b/applecorn.s @@ -7,7 +7,8 @@ * Assembled with the Merlin 8 v2.58 assembler on Apple II. XC ; 65c02 - ORG $2000 ; Load addr of loader in main memory + ORG $4000 ; Load addr of loader in main memory + ; Clear of first HGR frame buffer * Monitor routines BELL EQU $FBDD @@ -47,12 +48,12 @@ IOBUF3 EQU $1800 IOBUF4 EQU $1C00 * 512 byte buffer sufficient for one disk block -BLKBUF EQU $6000 ; Can't use $400 as ProDOS uses -BLKBUFEND EQU $6200 ; 'hidden' bytes within screen +BLKBUF EQU $9000 ; Can't use $400 as ProDOS uses +BLKBUFEND EQU $9200 ; 'hidden' bytes within screen * 512 byte buffer for file copy (*COPY) -COPYBUF EQU $6200 ; File copy needs separate buffer -*COPYBUFEND EQU $6400 +COPYBUF EQU $9200 ; File copy needs separate buffer +*COPYBUFEND EQU $9400 * Address in aux memory where ROM will be loaded AUXADDR EQU $8000 @@ -176,6 +177,10 @@ MAINZP MAC PUT MAINMEM.WILD PUT MAINMEM.LISTS PUT MAINMEM.MISC + PUT FDRAW + PUT FDRAW.LINE + PUT FDRAW.CIRCLE + PUT FDRAW.TABLES PUT AUXMEM.MOSEQU PUT AUXMEM.INIT PUT AUXMEM.VDU diff --git a/mainmem.ldr.s b/mainmem.ldr.s index 0987dd4..a5f2824 100644 --- a/mainmem.ldr.s +++ b/mainmem.ldr.s @@ -102,6 +102,16 @@ START JSR ROMMENU LDA #>GSBRK STA $3F0+1 +* LDA $C057 ; Enable hi-res +* LDA $C054 ; Enable page 1 +* LDA $C050 ; Enable graphics + JSR Entry+0 ; Initialize FDRAW library + LDA #$20 + STA Entry+5
JSR Entry+19 ; FDRAW: Set page $2000 + JSR Entry+22 ; FDRAW: clear HGR screen +* LDA $C051 ; Enable text again + TSX ; Save SP at $0100 in aux STA $C005 ; Write to aux STX $0100