added verafx.clear()

2025-10-25 22:17:23 +00:00 · 2023-10-02 01:34:56 +02:00
parent 70ee2026ff
commit c3f1f09ad1
4 changed files with 66 additions and 13 deletions
--- a/compiler/res/prog8lib/cx16/verafx.p8
+++ b/compiler/res/prog8lib/cx16/verafx.p8
@@ -6,15 +6,45 @@
 verafx {
    %option no_symbol_prefixing
-    sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) {
+    sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
-        ; TODO use vera fx cache write
+        ; Fill amountof32bits 4-byte units of video memory, starting at vbank:vaddr, with the byte value 'data', using cached 4-byte writes.
-        cx16.vaddr(vbank, vaddr, 0, true)
+        ; this routine is around 3 times faster than gfx2.clear_screen()
-        repeat numlongs {
+        cx16.VERA_CTRL = 0
-            cx16.VERA_DATA0 = data
+        cx16.VERA_ADDR_H = vbank | %00110000       ; 4-byte increment
-            cx16.VERA_DATA0 = data
+        cx16.VERA_ADDR_M = msb(vaddr)
-            cx16.VERA_DATA0 = data
+        cx16.VERA_ADDR_L = lsb(vaddr)
-            cx16.VERA_DATA0 = data
+        cx16.VERA_CTRL = 6<<1       ; dcsel = 6, fill all 4 bytes of the 32-bit cache with the data byte
        cx16.VERA_FX_CACHE_L = data
        cx16.VERA_FX_CACHE_M = data
        cx16.VERA_FX_CACHE_H = data
        cx16.VERA_FX_CACHE_U = data
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01000000    ; cache write enable
        if (amountof32bits & %1111110000000011) == 0 {     ; multiple of 4 and below 1024: count/4 fits in a byte
            repeat lsb(amountof32bits >> 2)
                unroll 4 cx16.VERA_DATA0=0       ; each write stores 4 bytes; unrolled 4x = 16 bytes per iteration
        }
        else if (amountof32bits & %1111111000000001) == 0 {     ; even and below 512: count/2 fits in a byte
            repeat lsb(amountof32bits >> 1)
                unroll 2 cx16.VERA_DATA0=0       ; each write stores 4 bytes; unrolled 2x = 8 bytes per iteration
        }
        else if (lsb(amountof32bits) & 3) == 0 {     ; multiple of 4, any size: word-sized repeat count
            repeat amountof32bits >> 2
                unroll 4 cx16.VERA_DATA0=0       ; each write stores 4 bytes; unrolled 4x = 16 bytes per iteration
        }
        else if (lsb(amountof32bits) & 1) == 0 {     ; even, any size: word-sized repeat count
            repeat amountof32bits >> 1
                unroll 2 cx16.VERA_DATA0=0       ; each write stores 4 bytes; unrolled 2x = 8 bytes per iteration
        }
        else {     ; odd count: no unrolling possible
            repeat amountof32bits
                cx16.VERA_DATA0=0       ; write 4 bytes at a time
        }
        cx16.VERA_FX_CTRL = 0       ; cache write disable
        cx16.VERA_CTRL = 0
    }
    ; unsigned multiplication just passes the values as signed to muls
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -493,9 +493,14 @@ Available for the Cx16 target.
 Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
 the emulators already support it).
-For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines.
+``mult``, ``muls``
-They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication.
+    For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
-But they depend on
+    They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
    But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
 ``clear``
    There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
    The routine is around 3 times faster than a regular unrolled loop to clear vram.
 Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
 to see what's in there.
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,8 @@
 TODO
 ====
 - '>>='  can be used as an operator in an expression?? should only be augmented assignment!
 - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
 - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -37,8 +37,24 @@ main {
        txt.nl()
        gfx2.screen_mode(1)
-        verafx.fill(0, 0, %10101010, 1200)        ; should fill top half of the screen
+
-        verafx.fill(0, 4800, %11111111, 1200)     ; should fill bottom half of the screen
+        cbm.SETTIM(0,0,0)
        repeat 255 {
            gfx2.clear_screen()
        }
        uword time1 = cbm.RDTIM16()
        cbm.SETTIM(0,0,0)
        repeat 255 {
            verafx.clear(0, 0, %10101010, 2400)
        }
        uword time2 = cbm.RDTIM16()
        gfx2.screen_mode(0)
        txt.print_uw(time1)
        txt.spc()
        txt.print_uw(time2)
        txt.nl()
 ;        txt.print_uw(math.mul16_last_upper())