added verafx.clear()

This commit is contained in:
Irmen de Jong
2023-10-02 01:34:56 +02:00
parent 70ee2026ff
commit c3f1f09ad1
4 changed files with 66 additions and 13 deletions

View File

@@ -6,15 +6,45 @@
verafx { verafx {
%option no_symbol_prefixing %option no_symbol_prefixing
sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) { sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
; TODO use vera fx cache write ; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value; vbank/vaddr give the start address, amountof32bits is the number of 4-byte units to write
cx16.vaddr(vbank, vaddr, 0, true) ; this routine is around 3 times faster than gfx2.clear_screen()
repeat numlongs { cx16.VERA_CTRL = 0
cx16.VERA_DATA0 = data cx16.VERA_ADDR_H = vbank | %00110000 ; 4-byte increment
cx16.VERA_DATA0 = data cx16.VERA_ADDR_M = msb(vaddr)
cx16.VERA_DATA0 = data cx16.VERA_ADDR_L = lsb(vaddr)
cx16.VERA_DATA0 = data cx16.VERA_CTRL = 6<<1 ; dcsel = 6, fill the 32 bits cache
cx16.VERA_FX_CACHE_L = data
cx16.VERA_FX_CACHE_M = data
cx16.VERA_FX_CACHE_H = data
cx16.VERA_FX_CACHE_U = data
cx16.VERA_CTRL = 2<<1 ; dcsel = 2
cx16.VERA_FX_MULT = 0
cx16.VERA_FX_CTRL = %01000000 ; cache write enable
if (amountof32bits & %1111110000000011) == 0 { ; count is a multiple of 4 and count/4 fits in a byte, so a byte-sized loop counter can be used
repeat lsb(amountof32bits >> 2)
unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled. NOTE(review): presumably the 0 written here is not the fill value itself (that comes from the cache loaded above) - confirm against the VERA FX reference
} }
else if (amountof32bits & %1111111000000001) == 0 { ; count is even and count/2 fits in a byte, so a byte-sized loop counter can be used
repeat lsb(amountof32bits >> 1)
unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
}
else if (lsb(amountof32bits) & 3) == 0 { ; count is a multiple of 4 but count/4 needs a word-sized loop counter
repeat amountof32bits >> 2
unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
}
else if (lsb(amountof32bits) & 1) == 0 { ; count is even but count/2 needs a word-sized loop counter
repeat amountof32bits >> 1
unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
}
else { ; odd count: no unrolling possible, one write per iteration
repeat amountof32bits
cx16.VERA_DATA0=0 ; write 4 bytes at a time
}
cx16.VERA_FX_CTRL = 0 ; cache write disable
cx16.VERA_CTRL = 0
} }
; unsigned multiplication just passes the values as signed to muls ; unsigned multiplication just passes the values as signed to muls

View File

@@ -493,9 +493,14 @@ Available for the Cx16 target.
Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards, Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
the emulators already support it). the emulators already support it).
For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines. ``mult`` , ``muls``
They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication. For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
But they depend on They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
``clear``
There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
The routine is around 3 times faster than a regular unrolled loop to clear vram.
Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_ Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
to see what's in there. to see what's in there.

View File

@@ -1,6 +1,8 @@
TODO TODO
==== ====
- '>>=' can be used as an operator in an expression?? should only be augmented assignment!
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
- [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction - [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
- IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified! - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!

View File

@@ -37,8 +37,24 @@ main {
txt.nl() txt.nl()
gfx2.screen_mode(1) gfx2.screen_mode(1)
verafx.fill(0, 0, %10101010, 1200) ; should fill top half of the screen
verafx.fill(0, 4800, %11111111, 1200) ; should fill bottom half of the screen cbm.SETTIM(0,0,0)
repeat 255 {
gfx2.clear_screen()
}
uword time1 = cbm.RDTIM16()
cbm.SETTIM(0,0,0)
repeat 255 {
verafx.clear(0, 0, %10101010, 2400)
}
uword time2 = cbm.RDTIM16()
gfx2.screen_mode(0)
txt.print_uw(time1)
txt.spc()
txt.print_uw(time2)
txt.nl()
; txt.print_uw(math.mul16_last_upper()) ; txt.print_uw(math.mul16_last_upper())