From 68d5983a14890440582bb8a7bbef42a4a23f2fc5 Mon Sep 17 00:00:00 2001 From: Irmen de Jong Date: Wed, 1 Jan 2025 19:06:11 +0100 Subject: [PATCH] optimize monogfx.plot() to use a *40 lookup table in lores mode. Speeds up a lot of other routines too (line etc) --- compiler/res/prog8lib/cx16/monogfx.p8 | 198 +++++++++++++++++++------- docs/source/todo.rst | 5 +- examples/test.p8 | 198 +++++++++++++++++++++++++- 3 files changed, 343 insertions(+), 58 deletions(-) diff --git a/compiler/res/prog8lib/cx16/monogfx.p8 b/compiler/res/prog8lib/cx16/monogfx.p8 index d49b422e7..cc92ebd81 100644 --- a/compiler/res/prog8lib/cx16/monogfx.p8 +++ b/compiler/res/prog8lib/cx16/monogfx.p8 @@ -16,6 +16,7 @@ monogfx { ; read-only control variables: uword width = 0 uword height = 0 + bool lores_mode ubyte mode const ubyte MODE_NORMAL = %00000000 const ubyte MODE_STIPPLE = %00000001 @@ -32,6 +33,7 @@ monogfx { cx16.VERA_L1_TILEBASE = 0 width = 320 height = 240 + lores_mode = true mode = MODE_NORMAL clear_screen(false) } @@ -47,6 +49,7 @@ monogfx { cx16.VERA_L1_TILEBASE = %00000001 width = 640 height = 480 + lores_mode = false mode = MODE_NORMAL clear_screen(false) } @@ -64,15 +67,12 @@ monogfx { sub clear_screen(bool draw) { position(0, 0) - when width { - 320 -> { - repeat 240/2/8 - cs_innerloop640(draw) - } - 640 -> { - repeat 480/8 - cs_innerloop640(draw) - } + if lores_mode { + repeat 240/2/8 + cs_innerloop640(draw) + } else { + repeat 480/8 + cs_innerloop640(draw) } position(0, 0) } @@ -109,7 +109,7 @@ monogfx { return if length<=8 { ; just use 2 byte writes with shifted mask - position2(xx,yy,true) + position2(xx,yy) %asm {{ ldy p8v_length lda p8v_masked_ends,y @@ -309,8 +309,8 @@ _done }} if mode!=MODE_STIPPLE { ; draw continuous line. 
- position2(xx,yy,true) - if width==320 + position2(xx,yy) + if lores_mode set_both_strides(11) ; 40 increment = 1 line in 320 px monochrome else set_both_strides(12) ; 80 increment = 1 line in 640 px monochrome @@ -328,8 +328,8 @@ drawmode: ora cx16.r15L lheight-- } lheight++ ; because it is divided by 2 later, don't round off the last pixel - position2(xx,yy,true) - if width==320 + position2(xx,yy) + if lores_mode set_both_strides(12) ; 80 increment = 2 line in 320 px monochrome else set_both_strides(13) ; 160 increment = 2 line in 640 px monochrome @@ -342,9 +342,9 @@ drawmode: ora cx16.r15L } } } else { - position2(xx,yy,true) + position2(xx,yy) cx16.r15 = ~cx16.r15 ; erase pixels - if width==320 + if lores_mode set_both_strides(11) ; 40 increment = 1 line in 320 px monochrome else set_both_strides(12) ; 80 increment = 1 line in 640 px monochrome @@ -679,26 +679,51 @@ invert: return sub prepare() { - %asm {{ - lda p8v_xx - and #7 - pha ; xbits - }} - xx /= 8 - if width==320 - xx += yy*(320/8) ; TODO *40 table - else - xx += yy*(640/8) ; TODO *80 table? (a bit large, need lo,mid,hi. maybe just reuse *40 table and do 1 shift.) 
- %asm {{ - stz cx16.VERA_CTRL - stz cx16.VERA_ADDR_H - lda p8v_xx+1 - sta cx16.VERA_ADDR_M - lda p8v_xx - sta cx16.VERA_ADDR_L - ply ; xbits - lda p8v_maskbits,y - }} + if lores_mode { + %asm {{ + stz cx16.VERA_CTRL + stz cx16.VERA_ADDR_H + + lda p8v_xx+1 + lsr a + lda p8v_xx + ror a + lsr a + lsr a + + clc + ldy p8v_yy + adc p8v_times40_lsb,y + sta cx16.VERA_ADDR_L + lda p8v_times40_msb,y + adc #0 + sta cx16.VERA_ADDR_M + + lda p8v_xx + and #7 + tax + lda p8v_maskbits,x + }} + } else { + ; width=640 (hires) + %asm {{ + stz cx16.VERA_CTRL + stz cx16.VERA_ADDR_H + lda p8v_xx + and #7 + pha ; xbits + }} + xx /= 8 + xx += yy*(640/8) + %asm {{ + lda p8v_xx+1 + sta cx16.VERA_ADDR_M + lda p8v_xx + sta cx16.VERA_ADDR_L + plx ; xbits + lda p8v_maskbits,x + }} + } } } @@ -718,10 +743,21 @@ invert: pha ; xbits }} xx /= 8 - if width==320 - xx += yy*(320/8) ; TODO *40 table + if lores_mode { + %asm {{ + ; xx += yy * 40 + ldy p8v_yy + lda p8v_xx + clc + adc p8v_times40_lsb,y + sta p8v_xx + lda p8v_xx+1 + adc p8v_times40_msb,y + sta p8v_xx+1 + }} + } else - xx += yy*(640/8) ; TODO *80 table? (a bit large, need lo,mid,hi maybe just reuse *40 table and do 1 shift.) + xx += yy*(640/8) %asm {{ stz cx16.VERA_CTRL @@ -830,10 +866,21 @@ skip: pha ; xbits }} xpos /= 8 - if width==320 - xpos += yy*(320/8) as uword ; TODO *40 table + if lores_mode { + %asm {{ + ; xpos += yy*40 + ldy p8v_yy + lda p8v_xpos + clc + adc p8v_times40_lsb,y + sta p8v_xpos + lda p8v_xpos+1 + adc p8v_times40_msb,y + sta p8v_xpos+1 + }} + } else - xpos += yy*(640/8) as uword ; TODO *80 table? (a bit large, need lo,mid,hi maybe just reuse *40 table and do 1 shift.) + xpos += yy*(640/8) as uword %asm {{ stz cx16.VERA_CTRL @@ -878,20 +925,63 @@ _doplot beq + } } - sub position(uword @zp xx, uword yy) { - if width==320 - cx16.r0 = yy*(320/8) ; TODO *40 table - else - cx16.r0 = yy*(640/8) ; TODO *80 table? (a bit large, need lo,mid,hi maybe just reuse *40 table and do 1 shift.) 
- cx16.vaddr(0, cx16.r0+(xx/8), 0, 1) + sub position(uword xx, uword yy) { + if lores_mode { + %asm {{ + stz cx16.VERA_CTRL + lda p8v_xx+1 + lsr a + lda p8v_xx + ror a + lsr a + lsr a + clc + ldy p8v_yy + adc p8v_times40_lsb,y + sta cx16.VERA_ADDR_L + lda p8v_times40_msb,y + adc #0 + sta cx16.VERA_ADDR_M + lda #%00010000 ; autoincr + sta cx16.VERA_ADDR_H + }} + } + else { + cx16.r0 = yy*(640/8) + cx16.vaddr(0, cx16.r0+(xx/8), 0, 1) + } + return } - sub position2(uword @zp xx, uword yy, bool also_port_1) { + sub position2(uword xx, uword yy) { position(xx, yy) - if also_port_1 - cx16.vaddr_clone(0) + ; also set port 1 like that + cx16.vaddr_clone(0) } + ; y*40 lookup table. Pretty compact because it all fits in a word and we only need 240 y positions. + ; a y*80 lookup table would be very large (lo,mid,hi for 480 values...) + uword[240] @split @shared times40 = [ + 0, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, + 640, 680, 720, 760, 800, 840, 880, 920, 960, 1000, 1040, 1080, 1120, 1160, + 1200, 1240, 1280, 1320, 1360, 1400, 1440, 1480, 1520, 1560, 1600, 1640, 1680, + 1720, 1760, 1800, 1840, 1880, 1920, 1960, 2000, 2040, 2080, 2120, 2160, 2200, + 2240, 2280, 2320, 2360, 2400, 2440, 2480, 2520, 2560, 2600, 2640, 2680, 2720, + 2760, 2800, 2840, 2880, 2920, 2960, 3000, 3040, 3080, 3120, 3160, 3200, 3240, + 3280, 3320, 3360, 3400, 3440, 3480, 3520, 3560, 3600, 3640, 3680, 3720, 3760, + 3800, 3840, 3880, 3920, 3960, 4000, 4040, 4080, 4120, 4160, 4200, 4240, 4280, + 4320, 4360, 4400, 4440, 4480, 4520, 4560, 4600, 4640, 4680, 4720, 4760, 4800, + 4840, 4880, 4920, 4960, 5000, 5040, 5080, 5120, 5160, 5200, 5240, 5280, 5320, + 5360, 5400, 5440, 5480, 5520, 5560, 5600, 5640, 5680, 5720, 5760, 5800, 5840, + 5880, 5920, 5960, 6000, 6040, 6080, 6120, 6160, 6200, 6240, 6280, 6320, 6360, + 6400, 6440, 6480, 6520, 6560, 6600, 6640, 6680, 6720, 6760, 6800, 6840, 6880, + 6920, 6960, 7000, 7040, 7080, 7120, 7160, 7200, 7240, 7280, 7320, 7360, 7400, + 7440, 
7480, 7520, 7560, 7600, 7640, 7680, 7720, 7760, 7800, 7840, 7880, 7920, + 7960, 8000, 8040, 8080, 8120, 8160, 8200, 8240, 8280, 8320, 8360, 8400, 8440, + 8480, 8520, 8560, 8600, 8640, 8680, 8720, 8760, 8800, 8840, 8880, 8920, 8960, + 9000, 9040, 9080, 9120, 9160, 9200, 9240, 9280, 9320, 9360, 9400, 9440, 9480, + 9520, 9560] + const ubyte charset_bank = $1 const uword charset_addr = $f000 ; in bank 1, so $1f000 @@ -948,7 +1038,7 @@ _doplot beq + bne -- }} ; left part of shifted char - position2(xx, yy, true) + position2(xx, yy) set_autoincrs() if draw { %asm {{ @@ -974,7 +1064,7 @@ cdraw_mod1 ora cx16.VERA_DATA1 } ; right part of shifted char if lsb(xx) & 7 !=0 { - position2(xx+8, yy, true) + position2(xx+8, yy) set_autoincrs() if draw { %asm {{ @@ -1005,7 +1095,7 @@ cdraw_mod2 ora cx16.VERA_DATA1 sub set_autoincrs() { ; set autoincrements to go to next pixel row (40 or 80 increment) - if width==320 { + if lores_mode { cx16.VERA_CTRL = 1 cx16.VERA_ADDR_H = cx16.VERA_ADDR_H & $0f | (11<<4) cx16.VERA_CTRL = 0 diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 210bda1c7..9ee8d50e3 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -1,6 +1,10 @@ TODO ==== +- optimize word addition word += mul40[indexbyte] to use adc with register indexed instructions +- same with other operators (sbc, and, or, eor) +- how is the codegen for byte values here? + - add paypal donation button as well? - announce prog8 on the 6502.org site? @@ -74,7 +78,6 @@ IR/VM Libraries --------- - Sorting module gnomesort_uw could be optimized more, rewrite in asm? Shellshort seems consistently faster even if most of the words are already sorted. -- Monogfx: use *40 multiplication lookup tables (possibly *80 as well? but those are a bit large; needing lo,mid,hi) - Add split-word array sorting routines to sorting module? - pet32 target: make syslib more complete (missing kernal routines)?
- need help with: PET disk routines (OPEN, SETLFS etc are not exposed as kernal calls) diff --git a/examples/test.p8 b/examples/test.p8 index 192c8771e..05838d8b6 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -1,9 +1,201 @@ -%zeropage basicsafe +%import textio +%import math +%import monogfx +%import gfx_lores %option no_sysinit main { sub start() { - if not unexistingsymbol - cx16.r0++ + + math.rndseed(1234,8877) + cbm.SETTIM(0,0,0) + kernal() + txt.print_uw(cbm.RDTIM16()) + + sys.wait(200) + + math.rndseed(1234,8877) + cbm.SETTIM(0,0,0) + custom_256() + txt.print_uw(cbm.RDTIM16()) + + sys.wait(200) + + + math.rndseed(1234,8877) + cbm.SETTIM(0,0,0) + custom_mono() + txt.print_uw(cbm.RDTIM16()) + + repeat { + } } + + sub kernal() { + cx16.set_screen_mode(128) + cx16.GRAPH_set_colors(2,1,0) + cx16.GRAPH_clear() + repeat 1000 { + cx16.r0 = math.rndw() % 320 + cx16.r1 = math.rnd() % 240 + cx16.r2 = math.rndw() % 320 + cx16.r3 = math.rnd() % 240 + cx16.GRAPH_draw_line(cx16.r0, cx16.r1, cx16.r2, cx16.r3) + } + cx16.set_screen_mode(0) + } + + sub custom_mono() { + monogfx.lores() + repeat 1000 { + cx16.r0 = math.rndw() % 320 + cx16.r1L = math.rnd() % 240 + cx16.r2 = math.rndw() % 320 + cx16.r3L = math.rnd() % 240 + line(cx16.r0, cx16.r1L, cx16.r2, cx16.r3L) + } + monogfx.textmode() + } + + sub custom_256() { + gfx_lores.graphics_mode() + repeat 1000 { + cx16.r0 = math.rndw() % 320 + cx16.r1L = math.rnd() % 240 + cx16.r2 = math.rndw() % 320 + cx16.r3L = math.rnd() % 240 + gfx_lores.line(cx16.r0, cx16.r1L, cx16.r2, cx16.r3L, 2) + } + gfx_lores.text_mode() + } + + sub line(uword @zp x1, ubyte @zp y1, uword @zp x2, ubyte @zp y2) { + ; Bresenham algorithm. + ; This code special-cases various quadrant loops to allow simple ++ and -- operations. 
+ if y1>y2 { + ; make sure dy is always positive to have only 4 instead of 8 special cases + cx16.r0 = x1 + x1 = x2 + x2 = cx16.r0 + cx16.r0L = y1 + y1 = y2 + y2 = cx16.r0L + } + word @zp dx = (x2 as word)-x1 + ubyte @zp dy = y2-y1 + + if dx==0 { + monogfx.vertical_line(x1, y1, abs(dy) as uword +1, true) + return + } + if dy==0 { + if x1>x2 + x1=x2 + monogfx.horizontal_line(x1, y1, abs(dx) as uword +1, true) + return + } + + cx16.r1L = 1 ;; true ; 'positive_ix' + if dx < 0 { + dx = -dx + cx16.r1L = 0 ;; false + } + word @zp dx2 = dx*2 + word @zp dy2 = dy*2 + word @zp d = 0 + cx16.VERA_CTRL = 0 + cx16.VERA_ADDR_H = 0 + if dx >= dy { + if cx16.r1L!=0 { + repeat { + plot() + if x1==x2 + return + x1++ + d += dy2 + if d > dx { + y1++ + d -= dx2 + } + } + } else { + repeat { + plot() + if x1==x2 + return + x1-- + d += dy2 + if d > dx { + y1++ + d -= dx2 + } + } + } + } + else { + if cx16.r1L!=0 { + repeat { + plot() + if y1==y2 + return + y1++ + d += dx2 + if d > dy { + x1++ + d -= dy2 + } + } + } else { + repeat { + plot() + if y1==y2 + return + y1++ + d += dx2 + if d > dy { + x1-- + d -= dy2 + } + } + } + } + + asmsub plot() { + %asm {{ + lda p8v_x1+1 + lsr a + lda p8v_x1 + ror a + lsr a + lsr a + + clc + ldy p8v_y1 + adc times40_lo,y + sta cx16.VERA_ADDR_L + lda times40_mid,y + adc #0 + sta cx16.VERA_ADDR_M + + lda p8v_x1 + and #7 + tax + lda maskbits,x + tsb cx16.VERA_DATA0 + rts + +maskbits .byte 128,64,32,16,8,4,2,1 +; multiplication by 40 lookup table, stored as separate low-byte (times40_lo) and high-byte (times40_mid) tables indexed by y +times40 := 40*range(240) + +times40_lo .byte <times40 +times40_mid .byte >times40 + + ; !notreached! + }} + } + + } + + }