added verafx.copy() routine for fast vram-to-vram copying ('blitting')

2025-01-26 19:30:59 +00:00 · 2023-12-22 17:52:43 +01:00 · 2023-12-22 17:52:43 +01:00 · 6cd392909c
commit 6cd392909c
parent 49ec430592
4 changed files with 107 additions and 24 deletions
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@ -21,7 +21,7 @@ verafx {
        return cx16.r1L
    }

-    sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
+    sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
        ; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value
        ; this routine is around 3 times faster as gfx2.clear_screen()
        cx16.VERA_CTRL = 0
@ -37,24 +37,24 @@ verafx {
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01000000    ; cache write enable

-        if (amountof32bits & %1111110000000011) == 0 {
-            repeat lsb(amountof32bits >> 2)
+        if (num_longwords & %1111110000000011) == 0 {
+            repeat lsb(num_longwords >> 2)
                unroll 4 cx16.VERA_DATA0=0       ; write 4*4 bytes at a time, unrolled
        }
-        else if (amountof32bits & %1111111000000001) == 0 {
-            repeat lsb(amountof32bits >> 1)
+        else if (num_longwords & %1111111000000001) == 0 {
+            repeat lsb(num_longwords >> 1)
                unroll 2 cx16.VERA_DATA0=0       ; write 2*4 bytes at a time, unrolled
        }
-        else if (lsb(amountof32bits) & 3) == 0 {
-            repeat amountof32bits >> 2
+        else if (lsb(num_longwords) & 3) == 0 {
+            repeat num_longwords >> 2
                unroll 4 cx16.VERA_DATA0=0       ; write 4*4 bytes at a time, unrolled
        }
-        else if (lsb(amountof32bits) & 1) == 0 {
-            repeat amountof32bits >> 1
+        else if (lsb(num_longwords) & 1) == 0 {
+            repeat num_longwords >> 1
                unroll 2 cx16.VERA_DATA0=0       ; write 2*4 bytes at a time, unrolled
        }
        else {
-            repeat amountof32bits
+            repeat num_longwords
                cx16.VERA_DATA0=0       ; write 4 bytes at a time
        }

@ -62,6 +62,53 @@ verafx {
        cx16.VERA_CTRL = 0
    }

+    sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
+        ; use cached 4-byte writes to quickly copy a portion of the video memory to somewhere else
+        ; this routine is around 40-50% faster than a plain byte-by-byte copy
+        ; srcbank, srcaddr: vram location that is read from
+        ; tgtbank, tgtaddr: vram location that is written to
+        ; num_longwords: amount to copy, counted in units of 4 bytes (each cached write stores 4 bytes)
+        ; NOTE: this routine overwrites cx16.r0 and reprograms the addresses of both VERA data ports
+        ; NOTE(review): the cached write technique presumably requires tgtaddr to be aligned to 4 bytes - confirm
+
+        ; data port 1 is the source: it is read one byte at a time via VERA_DATA1
+        cx16.VERA_CTRL = 1
+        cx16.VERA_ADDR_H = srcbank | %00010000       ; source: 1-byte increment
+        cx16.VERA_ADDR_M = msb(srcaddr)
+        cx16.VERA_ADDR_L = lsb(srcaddr)
+        ; data port 0 is the target: every write to VERA_DATA0 advances the address by 4
+        cx16.VERA_CTRL = 0
+        cx16.VERA_ADDR_H = tgtbank | %00110000       ; target: 4-byte increment
+        cx16.VERA_ADDR_M = msb(tgtaddr)
+        cx16.VERA_ADDR_L = lsb(tgtaddr)
+        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
+        ; NOTE(review): presumably resets the remaining FX options so that only the cache is in effect - confirm
+        cx16.VERA_FX_MULT = 0
+        cx16.VERA_FX_CTRL = %01100000    ; cache write enable + cache fill enable
+        cx16.r0 = num_longwords
+
+        ; an even count uses a loop body that is unrolled twice, so only half the iterations are needed
+        if (cx16.r0L & 1) == 0 {
+            repeat cx16.r0>>1 {
+                %asm {{
+                    lda  cx16.VERA_DATA1    ; fill cache with 4 source bytes...
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
+                    lda  cx16.VERA_DATA1    ; fill cache with 4 source bytes...
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
+                }}
+            }
+        } else {
+            ; an odd count: copy one longword (4 reads, 1 cached write) per iteration
+            repeat cx16.r0 {
+                %asm {{
+                    lda  cx16.VERA_DATA1    ; fill cache with 4 source bytes...
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    lda  cx16.VERA_DATA1
+                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
+                }}
+            }
+        }
+
+        ; writing 0 also clears the cache fill bit that was set above; dcsel is still 2 at this point
+        cx16.VERA_FX_CTRL = 0    ; cache write disable
+        cx16.VERA_CTRL = 0
+    }
+
    ; unsigned multiplication just passes the values as signed to muls
    ; if you do this yourself in your call to muls, it will save a few instructions.
    sub mult(uword value1, uword value2) -> uword {
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@ -625,6 +625,11 @@ the emulators already support it).
    Very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
    The routine is around 3 times faster as a regular unrolled loop to clear vram.

+``copy``
+    Very quickly copy a portion of the video memory to somewhere else in vram (4 bytes at a time).
+    Sometimes this is also called "blitting".
+    This routine is around 40-50% faster than a regular byte-by-byte copy.
+
 ``transparency``
    Enable or disable transparent writes (color 0 will be transparent if enabled).

--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@ -2,12 +2,6 @@
 TODO
 ====

- verafx vram-vram copy routine?
-set the cache fill and cache write bits in fx ctrl, set one data port's increment to 1 and the other one to 4,
-Assuming your writes are aligned to 32-bit boundaries, do four reads from the increment-1 port
-(ex: lda DATA1 ; 4 times) and then stz the other one (stz DATA0).
-The cache is loaded by the DATA1 reads, and the contents are written out with the DATA0 write, 4 bytes at once.
-
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....

 ...
--- a/examples/test.p8
+++ b/examples/test.p8
@ -1,14 +1,51 @@
 %import textio
-%zeropage basicsafe
+%import verafx
+%import diskio
+%zeropage dontuse

 main {
    sub start() {
-        uword module
-        module++
-        module.test++
+        ; benchmark: time 1000 plain byte-by-byte vram copies against 1000 verafx.copy() calls
+        ; both variants copy 8*256 bytes (passed as 8*256/4 longwords) from 0:$4000 to 1:$f000
+        txt.uppercase()
+        txt.print("abcdefghijklmnopqrstuvwxyz\n")
+        txt.print("ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
+        txt.print("0123456789!@#$%^&*()-=[]<>\n")
+
+        ; the result of the load is deliberately ignored
+        ; NOTE(review): presumably loads the file into vram bank 0 at $4000, the copy source used below - confirm
+        void diskio.vload_raw("fantasy.pf", 0, $4000)
+;        for cx16.r0 in $4000 to $4000+256*$0008 {
+;            cx16.vpoke(0, cx16.r0, %10101010)
+;        }
+
+        ; reset the timer so that the value printed below covers only this loop
+        cbm.SETTIM(0,0,0)
+        repeat 1000 {
+            slowcopy(0, $4000, 1, $f000, 8*256/4)
+        }
+        txt.print("\nslow copy time: ")
+        txt.print_uw(cbm.RDTIM16())
+        txt.nl()
+        sys.wait(60)
+        txt.lowercase()
+        sys.wait(120)
+
+        ; same measurement again, now using the cached 4-byte writes of verafx.copy()
+        cbm.SETTIM(0,0,0)
+        repeat 1000 {
+            verafx.copy(0, $4000, 1, $f000, 8*256/4)
+        }
+        txt.print("verafx copy time: ")
+        txt.print_uw(cbm.RDTIM16())
+        txt.nl()
+        sys.wait(60)
+;        txt.uppercase()
+    }
+
+    sub slowcopy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
+        ; reference implementation for the benchmark: copies num_longwords*4 bytes one byte at a time
+        ; takes the same parameters as verafx.copy() so the two calls can be compared directly
+        ; NOTE(review): vaddr arguments are presumably (bank, address, data port number, increment) - confirm
+        cx16.vaddr(srcbank, srcaddr, 0, 1)
+        cx16.vaddr(tgtbank, tgtaddr, 1, 1)
+        repeat num_longwords {
+            ; each assignment reads one byte from VERA_DATA0 and writes it to VERA_DATA1; 4 of them make one longword
+            cx16.VERA_DATA1=cx16.VERA_DATA0
+            cx16.VERA_DATA1=cx16.VERA_DATA0
+            cx16.VERA_DATA1=cx16.VERA_DATA0
+            cx16.VERA_DATA1=cx16.VERA_DATA0
+        }
+        cx16.VERA_CTRL = 0
    }
 }
-
-module {
-    ubyte @shared test
-}