optimizing gfx2.fill()

This commit is contained in:
Irmen de Jong 2024-08-23 19:33:20 +02:00
parent beaff4d650
commit e2fcac322f
3 changed files with 96 additions and 69 deletions

View File

@ -713,33 +713,20 @@ gfx2 {
while cx16.r12L!=0 {
pop_stack()
xx = x1
; possible speed optimization: if mode==1 (256c) use vera autodecrement instead of pget(), but code bloat not worth it?
while xx >= 0 {
if pget(xx as uword, yy as uword) != cx16.r11L
break
xx--
when active_mode {
1 -> if fill_scanline_left_8bpp() goto skip
2 -> if fill_scanline_left_2bpp() goto skip
}
if x1!=xx
horizontal_line(xx as uword+1, yy as uword, x1-xx as uword, cx16.r10L)
else
goto skip
left = xx + 1
if left < x1
push_stack(left, x1 - 1, yy, -dy)
xx = x1 + 1
do {
cx16.r9s = xx
; possible speed optimization: if mode==1 (256c) use vera autoincrement instead of pget(), but code bloat not worth it?
while xx <= width-1 {
if pget(xx as uword, yy as uword) != cx16.r11L
break
xx++
when active_mode {
1 -> fill_scanline_right_8bpp()
2 -> fill_scanline_right_2bpp()
}
if cx16.r9s!=xx
horizontal_line(cx16.r9, yy as uword, xx-cx16.r9s as uword, cx16.r10L)
push_stack(left, xx - 1, yy, dy)
if xx > x2 + 1
push_stack(x2 + 1, xx - 1, yy, -dy)
@ -753,6 +740,71 @@ skip:
left = xx
} until xx>x2
}
sub fill_scanline_left_8bpp() -> bool {
; Flood-fill helper for the 256 colour lores mode (active_mode 1): scans to the left along row yy,
; starting at column xx, and overwrites every pixel that equals cx16.r11L with cx16.r10L.
; Works directly on the enclosing scope's xx and yy; on exit xx is the first column that was NOT filled
; (it drops below 0 when the scan ran past column 0).
; Returns true when xx did not move at all, i.e. not a single pixel was filled.
; Clobbers cx16.r0, cx16.r1L (address calculation result) and cx16.r9s (saved start column).
; NOTE(review): VERA_CTRL is left at 1 when this routine returns - confirm that callers don't depend on it being 0.
void addr_mul_24_for_lores_256c(yy as uword, xx as uword) ; 24 bits result is in r0 and r1L (highest byte)
; The same start address is programmed twice, once with VERA_CTRL=0 and once with VERA_CTRL=1
; (presumably selecting the address registers of data port 0 and data port 1 respectively - see VERA ADDRSEL).
; DATA0 is used below to read pixels and DATA1 to write them, so both step leftwards independently.
cx16.VERA_CTRL = 0
cx16.VERA_ADDR_H = cx16.r1L | %00011000 ; auto decrement enabled
cx16.VERA_ADDR_M = cx16.r0H
cx16.VERA_ADDR_L = cx16.r0L
cx16.VERA_CTRL = 1
cx16.VERA_ADDR_H = cx16.r1L | %00011000 ; auto decrement enabled
cx16.VERA_ADDR_M = cx16.r0H
cx16.VERA_ADDR_L = cx16.r0L
; remember the start column so we can tell afterwards whether anything was filled
cx16.r9s = xx
while xx >= 0 {
; stop at the first pixel that doesn't have the colour being replaced
if cx16.VERA_DATA0 != cx16.r11L
break
; the pixel matched: overwrite it with the fill colour through the second data port
cx16.VERA_DATA1 = cx16.r10L
xx--
}
return xx==cx16.r9s
}
sub fill_scanline_right_8bpp() {
; Flood-fill helper for the 256 colour lores mode (active_mode 1): scans to the right along row yy,
; starting at column xx, and overwrites every pixel that equals cx16.r11L with cx16.r10L.
; Works directly on the enclosing scope's xx and yy; on exit xx is the first column that was NOT filled
; (it equals width when the scan ran past the last column).
; Unlike the left-scanning variant this routine returns nothing and does not store the start column itself.
; Clobbers cx16.r0 and cx16.r1L (address calculation result).
; NOTE(review): VERA_CTRL is left at 1 when this routine returns - confirm that callers don't depend on it being 0.
void addr_mul_24_for_lores_256c(yy as uword, xx as uword) ; 24 bits result is in r0 and r1L (highest byte)
; The same start address is programmed twice, once with VERA_CTRL=0 and once with VERA_CTRL=1
; (presumably selecting the address registers of data port 0 and data port 1 respectively - see VERA ADDRSEL).
; DATA0 is used below to read pixels and DATA1 to write them, so both step rightwards independently.
cx16.VERA_CTRL = 0
cx16.VERA_ADDR_H = cx16.r1L | %00010000 ; auto increment enabled
cx16.VERA_ADDR_M = cx16.r0H
cx16.VERA_ADDR_L = cx16.r0L
cx16.VERA_CTRL = 1
cx16.VERA_ADDR_H = cx16.r1L | %00010000 ; auto increment enabled
cx16.VERA_ADDR_M = cx16.r0H
cx16.VERA_ADDR_L = cx16.r0L
while xx <= width-1 {
; stop at the first pixel that doesn't have the colour being replaced
if cx16.VERA_DATA0 != cx16.r11L
break
; the pixel matched: overwrite it with the fill colour through the second data port
cx16.VERA_DATA1 = cx16.r10L
xx++
}
}
sub fill_scanline_left_2bpp() -> bool {
; Flood-fill helper for active_mode 2: scans to the left along row yy, starting at column xx,
; for as long as pget() returns the colour in cx16.r11L, then draws the found span in one go
; with horizontal_line() using the colour in cx16.r10L.
; Works directly on the enclosing scope's xx and yy; on exit xx is the first column that did NOT match
; (it drops below 0 when the scan ran past column 0).
; Returns true when xx did not move at all (nothing was drawn), false when a span was filled.
; Clobbers cx16.r9s (saved start column).
; TODO optimize this to use vera auto-decrements, but requires masking etc because of 4 pixels per byte...
cx16.r9s = xx
while xx >= 0 {
if pget(xx as uword, yy as uword) as ubyte != cx16.r11L
break
xx--
}
if xx!=cx16.r9s {
; the span runs from column xx+1 up to and including the start column, so it is (start - xx) pixels long
horizontal_line(xx+1 as uword, yy as uword, cx16.r9s-xx as uword, cx16.r10L)
return false
}
return true
}
sub fill_scanline_right_2bpp() {
; Flood-fill helper for active_mode 2: scans to the right along row yy, starting at column xx,
; for as long as pget() returns the colour in cx16.r11L, then draws the found span in one go
; with horizontal_line() using the colour in cx16.r10L.
; Works directly on the enclosing scope's xx and yy; on exit xx is the first column that did NOT match
; (it equals width when the scan ran past the last column). Nothing is drawn when xx did not move.
; Clobbers cx16.r9s (saved start column).
; TODO optimize this to use vera auto-increments, but requires masking etc because of 4 pixels per byte...
cx16.r9s = xx
while xx <= width-1 {
if pget(xx as uword, yy as uword) as ubyte != cx16.r11L
break
xx++
}
if xx!=cx16.r9s
; the span starts at the saved start column and is (xx - start) pixels long
; (cx16.r9 is presumably the unsigned view of the same register as cx16.r9s - TODO confirm)
horizontal_line(cx16.r9, yy as uword, xx-cx16.r9s as uword, cx16.r10L)
}
}
sub position(uword @zp xx, uword yy) {

View File

@ -1,7 +1,8 @@
TODO
====
Move vectors such as USRADD in cx16 to cbm block?
Optimize gfx2 fill_scanline_XXX routines also for the 2bpp modes (4c hires)
See open issues on github.
@ -40,7 +41,6 @@ Compiler:
- do we need (array)variable alignment tag instead of block alignment tag? You want to align the data, not the code in the block?
- ir: related to the one above: block alignment doesn't translate well to the variables in the block (the actual data that needs to be aligned in memory), so what is really needed is a variable alignment tag instead of a block alignment tag
- ir: fix call() return value handling
- ir: add specialized bit test instructions to support "prog8_ifelse_bittest_xxx" see the check in IRCodeGen
- ir: proper code gen for the CALLI instruction and that it (optionally) returns a word value that needs to be assigned to a reg
- ir: idea: (but LLVM IR simply keeps the variables, so not a good idea then?...): replace all scalar variables by an allocated register. Keep a table of the variable to register mapping (including the datatype)
global initialization values are simply a list of LOAD instructions.

View File

@ -1,64 +1,39 @@
%import gfx2
%import textio
%import string
%import math
%option no_sysinit
%zeropage basicsafe
main {
str large1 = "the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog."
str large2 = "the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the laxx doggo doggo."
sub start() {
txt.nl()
check("", "", 0)
check("", "a", -1)
check("a", "", 1)
check("a", "a", 0)
check("a", "z", -1)
check("z", "a", 1)
check("irmen", "irmen", 0)
check("irmen", "irmen2", -1)
check("irmen2", "irmen", 1)
check("irmen", "irxen", -1)
check("irmen", "irman", 1)
txt.nl()
bench() ; orig: 88 (pet: 713) optimized: 56 451
bench2() ; orig: 131 (pet: 1066) optimized: 83 674
gfx2.screen_mode(2)
demofill()
}
sub bench2() {
sub demofill() {
gfx2.circle(160, 120, 110, 1)
gfx2.rect(180, 5, 25, 190, 2)
gfx2.line(100, 150, 240, 10, 2)
gfx2.rect(150, 130, 10, 100, 3)
sys.wait(30)
cbm.SETTIM(0,0,0)
repeat 1000 {
bool compare = large1 != large2
cx16.r0L++
compare = large1 > large2
cx16.r0L++
compare = large1 <= large2
}
txt.print_uw(cbm.RDTIM16())
txt.nl()
}
gfx2.fill(100,100,3)
gfx2.fill(100,100,2)
gfx2.fill(100,100,0)
uword duration = cbm.RDTIM16()
sys.wait(30)
sub bench() {
cbm.SETTIM(0,0,0)
repeat 2000 {
void string.compare(large1,large2)
}
txt.print_uw(cbm.RDTIM16())
gfx2.screen_mode(0)
txt.nl()
}
txt.print_uw(duration)
txt.print(" jiffies\n")
; hires 4c before optimizations: ~345 jiffies
sub check(str s1, str s2, byte expected) {
byte result = string.compare(s1, s2)
txt.print(s1)
txt.print(" & ")
txt.print(s2)
txt.print(": ")
txt.print_b(result)
if result!=expected
txt.print(" !wrong!\n")
else
txt.nl()
}
}