mirror of
https://github.com/irmen/prog8.git
synced 2025-10-25 22:17:23 +00:00
added verafx.clear()
This commit is contained in:
@@ -6,15 +6,45 @@
|
|||||||
verafx {
|
verafx {
|
||||||
%option no_symbol_prefixing
|
%option no_symbol_prefixing
|
||||||
|
|
||||||
sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) {
|
sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
|
||||||
; TODO use vera fx cache write
|
; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value
|
||||||
cx16.vaddr(vbank, vaddr, 0, true)
|
; this routine is around 3 times faster than gfx2.clear_screen()
|
||||||
repeat numlongs {
|
cx16.VERA_CTRL = 0
|
||||||
cx16.VERA_DATA0 = data
|
cx16.VERA_ADDR_H = vbank | %00110000 ; 4-byte increment
|
||||||
cx16.VERA_DATA0 = data
|
cx16.VERA_ADDR_M = msb(vaddr)
|
||||||
cx16.VERA_DATA0 = data
|
cx16.VERA_ADDR_L = lsb(vaddr)
|
||||||
cx16.VERA_DATA0 = data
|
cx16.VERA_CTRL = 6<<1 ; dcsel = 6, fill the 32 bits cache
|
||||||
|
cx16.VERA_FX_CACHE_L = data
|
||||||
|
cx16.VERA_FX_CACHE_M = data
|
||||||
|
cx16.VERA_FX_CACHE_H = data
|
||||||
|
cx16.VERA_FX_CACHE_U = data
|
||||||
|
cx16.VERA_CTRL = 2<<1 ; dcsel = 2
|
||||||
|
cx16.VERA_FX_MULT = 0
|
||||||
|
cx16.VERA_FX_CTRL = %01000000 ; cache write enable
|
||||||
|
|
||||||
|
if (amountof32bits & %1111110000000011) == 0 {
|
||||||
|
repeat lsb(amountof32bits >> 2)
|
||||||
|
unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
|
||||||
}
|
}
|
||||||
|
else if (amountof32bits & %1111111000000001) == 0 {
|
||||||
|
repeat lsb(amountof32bits >> 1)
|
||||||
|
unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
|
||||||
|
}
|
||||||
|
else if (lsb(amountof32bits) & 3) == 0 {
|
||||||
|
repeat amountof32bits >> 2
|
||||||
|
unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
|
||||||
|
}
|
||||||
|
else if (lsb(amountof32bits) & 1) == 0 {
|
||||||
|
repeat amountof32bits >> 1
|
||||||
|
unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
repeat amountof32bits
|
||||||
|
cx16.VERA_DATA0=0 ; write 4 bytes at a time
|
||||||
|
}
|
||||||
|
|
||||||
|
cx16.VERA_FX_CTRL = 0 ; cache write disable
|
||||||
|
cx16.VERA_CTRL = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
; unsigned multiplication just passes the values as signed to muls
|
; unsigned multiplication just passes the values as signed to muls
|
||||||
|
|||||||
@@ -493,9 +493,14 @@ Available for the Cx16 target.
|
|||||||
Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
|
Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards,
|
||||||
the emulators already support it).
|
the emulators already support it).
|
||||||
|
|
||||||
For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines.
|
``mult`` , ``muls``
|
||||||
They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
|
For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively).
|
||||||
But they depend on
|
They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication.
|
||||||
|
But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage.
|
||||||
|
|
||||||
|
``clear``
|
||||||
|
There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
|
||||||
|
The routine is around 3 times faster than a regular unrolled loop to clear vram.
|
||||||
|
|
||||||
Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
|
Read the `source code <https://github.com/irmen/prog8/tree/master/compiler/res/prog8lib/cx16/verafx.p8>`_
|
||||||
to see what's in there.
|
to see what's in there.
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
TODO
|
TODO
|
||||||
====
|
====
|
||||||
|
|
||||||
|
- '>>=' can be used as an operator in an expression?? should only be augmented assignment!
|
||||||
|
|
||||||
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
|
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
|
||||||
- [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
|
- [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
|
||||||
- IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!
|
- IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified!
|
||||||
|
|||||||
@@ -37,8 +37,24 @@ main {
|
|||||||
txt.nl()
|
txt.nl()
|
||||||
|
|
||||||
gfx2.screen_mode(1)
|
gfx2.screen_mode(1)
|
||||||
verafx.fill(0, 0, %10101010, 1200) ; should fill top half of the screen
|
|
||||||
verafx.fill(0, 4800, %11111111, 1200) ; should fill bottom half of the screen
|
cbm.SETTIM(0,0,0)
|
||||||
|
repeat 255 {
|
||||||
|
gfx2.clear_screen()
|
||||||
|
}
|
||||||
|
uword time1 = cbm.RDTIM16()
|
||||||
|
|
||||||
|
cbm.SETTIM(0,0,0)
|
||||||
|
repeat 255 {
|
||||||
|
verafx.clear(0, 0, %10101010, 2400)
|
||||||
|
}
|
||||||
|
uword time2 = cbm.RDTIM16()
|
||||||
|
|
||||||
|
gfx2.screen_mode(0)
|
||||||
|
txt.print_uw(time1)
|
||||||
|
txt.spc()
|
||||||
|
txt.print_uw(time2)
|
||||||
|
txt.nl()
|
||||||
|
|
||||||
|
|
||||||
; txt.print_uw(math.mul16_last_upper())
|
; txt.print_uw(math.mul16_last_upper())
|
||||||
|
|||||||
Reference in New Issue
Block a user