optimizing gfx2.fill()

2025-01-10 20:30:23 +00:00 · 2024-08-23 19:33:20 +02:00 · 2024-08-23 19:33:20 +02:00 · e2fcac322f
commit e2fcac322f
parent beaff4d650
3 changed files with 96 additions and 69 deletions
--- a/compiler/res/prog8lib/cx16/gfx2.p8
+++ b/compiler/res/prog8lib/cx16/gfx2.p8
@ -713,33 +713,20 @@ gfx2 {
        while cx16.r12L!=0 {
            pop_stack()
            xx = x1
-            ; possible speed optimization: if mode==1 (256c) use vera autodecrement instead of pget(), but code bloat not worth it?
-            while xx >= 0 {
-                if pget(xx as uword, yy as uword) != cx16.r11L
-                    break
-                xx--
+            when active_mode {
+                1 -> if fill_scanline_left_8bpp() goto skip
+                2 -> if fill_scanline_left_2bpp() goto skip
            }
-            if x1!=xx
-                horizontal_line(xx as uword+1, yy as uword, x1-xx as uword, cx16.r10L)
-            else
-                goto skip
-
            left = xx + 1
            if left < x1
                push_stack(left, x1 - 1, yy, -dy)
            xx = x1 + 1

            do {
-                cx16.r9s = xx
-                ; possible speed optimization: if mode==1 (256c) use vera autoincrement instead of pget(), but code bloat not worth it?
-                while xx <= width-1 {
-                    if pget(xx as uword, yy as uword) != cx16.r11L
-                        break
-                    xx++
+                when active_mode {
+                    1 -> fill_scanline_right_8bpp()
+                    2 -> fill_scanline_right_2bpp()
                }
-                if cx16.r9s!=xx
-                    horizontal_line(cx16.r9, yy as uword, xx-cx16.r9s as uword, cx16.r10L)
-
                push_stack(left, xx - 1, yy, dy)
                if xx > x2 + 1
                    push_stack(x2 + 1, xx - 1, yy, -dy)
@ -753,6 +740,71 @@ skip:
                left = xx
            } until xx>x2
        }
+
+        sub fill_scanline_left_8bpp() -> bool {
+            ; Scans and fills one scanline to the left, starting at (xx, yy), in a single pass.
+            ; Both VERA data ports are given the same start address with auto-decrement:
+            ; VERA_DATA0 is read to compare each pixel byte against cx16.r11L,
+            ; VERA_DATA1 is written with cx16.r10L for every byte that matched.
+            ; xx is decremented once per filled pixel and is left at the final position for the caller.
+            ; Returns true when xx did not move (nothing was written), false otherwise.
+            ; NOTE(review): VERA_CTRL is still 1 when this returns - confirm no later code relies on it being 0.
+            void addr_mul_24_for_lores_256c(yy as uword, xx as uword)      ; 24 bits result is in r0 and r1L (highest byte)
+            ; VERA_CTRL=0: the address writes below presumably apply to the DATA0 port (see VERA reference)
+            cx16.VERA_CTRL = 0
+            cx16.VERA_ADDR_H = cx16.r1L | %00011000     ; auto decrement enabled
+            cx16.VERA_ADDR_M = cx16.r0H
+            cx16.VERA_ADDR_L = cx16.r0L
+            ; VERA_CTRL=1: same address and decrement setting, presumably for the DATA1 port
+            cx16.VERA_CTRL = 1
+            cx16.VERA_ADDR_H = cx16.r1L | %00011000     ; auto decrement enabled
+            cx16.VERA_ADDR_M = cx16.r0H
+            cx16.VERA_ADDR_L = cx16.r0L
+            cx16.r9s = xx       ; remember the start position to detect "nothing filled"
+            while xx >= 0 {
+                ; the read tests the current pixel; the loop stops at the first byte that differs from cx16.r11L
+                if cx16.VERA_DATA0 != cx16.r11L
+                    break
+                ; matching pixel: overwrite it through the second port
+                cx16.VERA_DATA1 = cx16.r10L
+                xx--
+            }
+            return xx==cx16.r9s
+        }
+
+        sub fill_scanline_right_8bpp() {
+            ; Scans and fills one scanline to the right, starting at (xx, yy), in a single pass.
+            ; Both VERA data ports are given the same start address with auto-increment:
+            ; VERA_DATA0 is read to compare each pixel byte against cx16.r11L,
+            ; VERA_DATA1 is written with cx16.r10L for every byte that matched.
+            ; xx is incremented once per filled pixel and is left at the final position for the caller.
+            ; NOTE(review): VERA_CTRL is still 1 when this returns - confirm no later code relies on it being 0.
+            void addr_mul_24_for_lores_256c(yy as uword, xx as uword)      ; 24 bits result is in r0 and r1L (highest byte)
+            ; VERA_CTRL=0: the address writes below presumably apply to the DATA0 port (see VERA reference)
+            cx16.VERA_CTRL = 0
+            cx16.VERA_ADDR_H = cx16.r1L | %00010000     ; auto increment enabled
+            cx16.VERA_ADDR_M = cx16.r0H
+            cx16.VERA_ADDR_L = cx16.r0L
+            ; VERA_CTRL=1: same address and increment setting, presumably for the DATA1 port
+            cx16.VERA_CTRL = 1
+            cx16.VERA_ADDR_H = cx16.r1L | %00010000     ; auto increment enabled
+            cx16.VERA_ADDR_M = cx16.r0H
+            cx16.VERA_ADDR_L = cx16.r0L
+            ; stop at the right screen edge (width-1) or at the first byte that differs from cx16.r11L
+            while xx <= width-1 {
+                if cx16.VERA_DATA0 != cx16.r11L
+                    break
+                ; matching pixel: overwrite it through the second port
+                cx16.VERA_DATA1 = cx16.r10L
+                xx++
+            }
+        }
+
+        sub fill_scanline_left_2bpp() -> bool {
+            ; Scans leftwards from xx on row yy for as long as pget() yields cx16.r11L,
+            ; then draws the scanned run in one horizontal_line() call using cx16.r10L.
+            ; xx is left at the position where the scan stopped.
+            ; Returns true when xx did not move (nothing was drawn), false otherwise.
+            ; TODO optimize this to use vera auto-decrements, but requires masking etc because of 4 pixels per byte...
+            cx16.r9s = xx       ; start position of the scan
+            while xx >= 0 {
+                if pget(xx as uword, yy as uword) as ubyte == cx16.r11L
+                    xx--
+                else
+                    break
+            }
+            ; nothing matched: report that to the caller without drawing
+            if xx==cx16.r9s
+                return true
+            horizontal_line(xx+1 as uword, yy as uword, cx16.r9s-xx as uword, cx16.r10L)
+            return false
+        }
+
+        sub fill_scanline_right_2bpp() {
+            ; Scans rightwards from xx on row yy for as long as pget() yields cx16.r11L
+            ; (bounded by width-1), then draws the scanned run in one horizontal_line() call
+            ; using cx16.r10L. xx is left at the position where the scan stopped.
+            ; TODO optimize this to use vera auto-increments, but requires masking etc because of 4 pixels per byte...
+            cx16.r9s = xx       ; start position of the scan, also the x passed to horizontal_line via cx16.r9
+            while xx <= width-1 {
+                if pget(xx as uword, yy as uword) as ubyte == cx16.r11L
+                    xx++
+                else
+                    break
+            }
+            ; nothing matched: no line to draw
+            if xx==cx16.r9s
+                return
+            horizontal_line(cx16.r9, yy as uword, xx-cx16.r9s as uword, cx16.r10L)
+        }
    }

    sub position(uword @zp xx, uword yy) {
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@ -1,7 +1,8 @@
 TODO
 ====

-Move vectors such as USRADD in cx16 to cbm block?
+Optimize gfx2 fill_scanline_XXX routines also for the 2bpp modes (4c hires)
+

 See open issues on github.

@ -40,7 +41,6 @@ Compiler:
 - do we need (array)variable alignment tag instead of block alignment tag? You want to align the data, not the code in the block?
 - ir: related to the one above: block alignment doesn't translate well to variables in the block (the actual stuff that needs to be aligned in memory)  but: need variable alignment tag instead of block alignment tag, really
 - ir: fix call() return value handling
- ir: add specialized bit test instructions to support "prog8_ifelse_bittest_xxx" see the check in IRCodeGen
 - ir: proper code gen for the CALLI instruction and that it (optionally) returns a word value that needs to be assigned to a reg
 - ir: idea: (but LLVM IR simply keeps the variables, so not a good idea then?...): replace all scalar variables by an allocated register. Keep a table of the variable to register mapping (including the datatype)
  global initialization values are simply a list of LOAD instructions.
--- a/examples/test.p8
+++ b/examples/test.p8
@ -1,64 +1,39 @@
+%import gfx2
 %import textio
-%import string
+%import math
+
 %option no_sysinit
 %zeropage basicsafe


 main {
-    str large1 = "the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog."
-    str large2 = "the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the laxx doggo doggo."

    sub start() {
-        txt.nl()
-        check("", "", 0)
-        check("", "a", -1)
-        check("a", "", 1)
-        check("a", "a", 0)
-        check("a", "z", -1)
-        check("z", "a", 1)
-        check("irmen", "irmen", 0)
-        check("irmen", "irmen2", -1)
-        check("irmen2", "irmen", 1)
-        check("irmen", "irxen", -1)
-        check("irmen", "irman", 1)
-        txt.nl()
-
-        bench()     ; orig: 88   (pet: 713)      optimized:   56  451
-        bench2()    ; orig: 131  (pet: 1066)     optimized:   83  674
+        gfx2.screen_mode(2)
+        demofill()
    }

-    sub bench2() {
+    sub demofill() {
+        gfx2.circle(160, 120, 110, 1)
+        gfx2.rect(180, 5, 25, 190, 2)
+        gfx2.line(100, 150, 240, 10, 2)
+        gfx2.rect(150, 130, 10, 100, 3)
+
+        sys.wait(30)
+
        cbm.SETTIM(0,0,0)
-        repeat 1000 {
-            bool compare = large1 != large2
-            cx16.r0L++
-            compare = large1 > large2
-            cx16.r0L++
-            compare = large1 <= large2
-        }
-        txt.print_uw(cbm.RDTIM16())
-        txt.nl()
-    }
+        gfx2.fill(100,100,3)
+        gfx2.fill(100,100,2)
+        gfx2.fill(100,100,0)
+        uword duration = cbm.RDTIM16()
+        sys.wait(30)

-    sub bench() {
-        cbm.SETTIM(0,0,0)
-        repeat 2000 {
-            void string.compare(large1,large2)
-        }
-        txt.print_uw(cbm.RDTIM16())
+        gfx2.screen_mode(0)
        txt.nl()
-    }
+        txt.print_uw(duration)
+        txt.print(" jiffies\n")
+
+        ; hires 4c before optimizations: ~345 jiffies

-    sub check(str s1, str s2, byte expected) {
-        byte result = string.compare(s1, s2)
-        txt.print(s1)
-        txt.print(" & ")
-        txt.print(s2)
-        txt.print(": ")
-        txt.print_b(result)
-        if result!=expected
-            txt.print("  !wrong!\n")
-        else
-            txt.nl()
    }
 }