added verafx.clear()

This commit is contained in:
Irmen de Jong 2023-10-02 01:34:56 +02:00
parent 70ee2026ff
commit c3f1f09ad1
4 changed files with 66 additions and 13 deletions

View File

@ -6,15 +6,45 @@
verafx {
%option no_symbol_prefixing
sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) {
; TODO use vera fx cache write
cx16.vaddr(vbank, vaddr, 0, true)
repeat numlongs {
cx16.VERA_DATA0 = data
cx16.VERA_DATA0 = data
cx16.VERA_DATA0 = data
cx16.VERA_DATA0 = data
sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
; Use cached 4-byte writes to quickly set a portion of the video memory to a given byte value.
; This routine is around 3 times faster than gfx2.clear_screen().
;   vbank          - OR-ed into VERA_ADDR_H together with the address increment bits
;                    NOTE(review): presumably only 0 or 1 is valid; other bits would change the increment setting - confirm
;   vaddr          - start address in vram; its msb/lsb are stored in VERA_ADDR_M / VERA_ADDR_L
;   data           - byte value copied into all four bytes of the 32 bits write cache
;   amountof32bits - number of 4-byte writes to perform (so 4 times this many bytes get written)
; The register writes below are order dependent: VERA_CTRL is rewritten between the groups
; to switch dcsel before the FX registers of that group are touched.
cx16.VERA_CTRL = 0 ; NOTE(review): presumably selects dcsel 0 / data port 0 for the address setup - confirm
cx16.VERA_ADDR_H = vbank | %00110000 ; 4-byte increment
cx16.VERA_ADDR_M = msb(vaddr)
cx16.VERA_ADDR_L = lsb(vaddr)
cx16.VERA_CTRL = 6<<1 ; dcsel = 6, fill the 32 bits cache
cx16.VERA_FX_CACHE_L = data
cx16.VERA_FX_CACHE_M = data
cx16.VERA_FX_CACHE_H = data
cx16.VERA_FX_CACHE_U = data
cx16.VERA_CTRL = 2<<1 ; dcsel = 2
cx16.VERA_FX_MULT = 0 ; NOTE(review): presumably switches off the multiplier related FX features - confirm
cx16.VERA_FX_CTRL = %01000000 ; cache write enable
; The branches below all perform amountof32bits writes to VERA_DATA0 in total; they only differ
; in the unroll factor and in whether a byte or a word loop counter is used.
; The value 0 stored into VERA_DATA0 is not the fill byte.
; NOTE(review): presumably with cache write enabled VERA stores the cache contents instead,
; and the written value selects which bytes to skip (0 = none) - confirm against the VERA FX reference.
if (amountof32bits & %1111110000000011) == 0 {
; multiple of 4 and below 1024: the count divided by 4 fits in a byte, so a byte counter is used
repeat lsb(amountof32bits >> 2)
unroll 4 cx16.VERA_DATA0=0 ; each write stores 4 bytes, unrolled 4 times
}
else if (amountof32bits & %1111111000000001) == 0 {
; even and below 512: the count divided by 2 fits in a byte, so a byte counter is used
repeat lsb(amountof32bits >> 1)
unroll 2 cx16.VERA_DATA0=0 ; each write stores 4 bytes, unrolled 2 times
}
else if (lsb(amountof32bits) & 3) == 0 {
; multiple of 4, any size: word counter
repeat amountof32bits >> 2
unroll 4 cx16.VERA_DATA0=0 ; each write stores 4 bytes, unrolled 4 times
}
else if (lsb(amountof32bits) & 1) == 0 {
; even, any size: word counter
repeat amountof32bits >> 1
unroll 2 cx16.VERA_DATA0=0 ; each write stores 4 bytes, unrolled 2 times
}
else {
; odd count: no unrolling possible, one write per iteration
repeat amountof32bits
cx16.VERA_DATA0=0 ; write 4 bytes at a time
}
cx16.VERA_FX_CTRL = 0 ; cache write disable
cx16.VERA_CTRL = 0 ; leave VERA_CTRL at 0 again, the same value it was given at the start of this routine
}
; unsigned multiplication just passes the values as signed to muls

View File

@ -493,9 +493,14 @@ Available for the Cx16 target.
Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
the emulators already support it).
For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines.
They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
But they depend on
``mult`` , ``muls``
For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
``clear``
There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
The routine is around 3 times faster than a regular unrolled loop to clear vram.
Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
to see what's in there.

View File

@ -1,6 +1,8 @@
TODO
====
- '>>=' can be used as an operator in an expression?? should only be augmented assignment!
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
- [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
- IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!

View File

@ -37,8 +37,24 @@ main {
txt.nl()
gfx2.screen_mode(1)
verafx.fill(0, 0, %10101010, 1200) ; should fill top half of the screen
verafx.fill(0, 4800, %11111111, 1200) ; should fill bottom half of the screen
cbm.SETTIM(0,0,0)
repeat 255 {
gfx2.clear_screen()
}
uword time1 = cbm.RDTIM16()
cbm.SETTIM(0,0,0)
repeat 255 {
verafx.clear(0, 0, %10101010, 2400)
}
uword time2 = cbm.RDTIM16()
gfx2.screen_mode(0)
txt.print_uw(time1)
txt.spc()
txt.print_uw(time2)
txt.nl()
; txt.print_uw(math.mul16_last_upper())