added verafx.copy() routine for fast vram-to-vram copying ('blitting')

This commit is contained in:
Irmen de Jong 2023-12-22 17:52:43 +01:00
parent 49ec430592
commit 6cd392909c
4 changed files with 107 additions and 24 deletions

View File

@ -21,7 +21,7 @@ verafx {
return cx16.r1L
}
sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value
; this routine is around 3 times faster as gfx2.clear_screen()
cx16.VERA_CTRL = 0
@ -37,24 +37,24 @@ verafx {
cx16.VERA_FX_MULT = 0
cx16.VERA_FX_CTRL = %01000000 ; cache write enable
if (amountof32bits & %1111110000000011) == 0 {
repeat lsb(amountof32bits >> 2)
if (num_longwords & %1111110000000011) == 0 {
repeat lsb(num_longwords >> 2)
unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled
}
else if (amountof32bits & %1111111000000001) == 0 {
repeat lsb(amountof32bits >> 1)
else if (num_longwords & %1111111000000001) == 0 {
repeat lsb(num_longwords >> 1)
unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled
}
else if (lsb(amountof32bits) & 3) == 0 {
repeat amountof32bits >> 2
else if (lsb(num_longwords) & 3) == 0 {
repeat num_longwords >> 2
unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled
}
else if (lsb(amountof32bits) & 1) == 0 {
repeat amountof32bits >> 1
else if (lsb(num_longwords) & 1) == 0 {
repeat num_longwords >> 1
unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled
}
else {
repeat amountof32bits
repeat num_longwords
cx16.VERA_DATA0=0 ; write 4 bytes at a time
}
@ -62,6 +62,53 @@ verafx {
cx16.VERA_CTRL = 0
}
sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
; use cached 4-byte writes to quickly copy a portion of the video memory to somewhere else
; this routine is around 40-50% faster than a plain byte-by-byte copy
;   srcbank, srcaddr : vram bank and 16-bit address to read from
;   tgtbank, tgtaddr : vram bank and 16-bit address to write to
;   num_longwords    : number of 4-byte groups to copy (4 * num_longwords bytes in total)
; side effects: overwrites cx16.r0; VERA_FX_CTRL and VERA_CTRL are both set to 0 on exit
; NOTE(review): the cached writes presumably require tgtaddr to be aligned to a 4-byte boundary - confirm against the VERA FX reference
cx16.VERA_CTRL = 1 ; the address registers written next belong to data port 1 (the port read in the loops below)
cx16.VERA_ADDR_H = srcbank | %00010000 ; source: 1-byte increment
cx16.VERA_ADDR_M = msb(srcaddr)
cx16.VERA_ADDR_L = lsb(srcaddr)
cx16.VERA_CTRL = 0 ; the address registers written next belong to data port 0 (the port written in the loops below)
cx16.VERA_ADDR_H = tgtbank | %00110000 ; target: 4-byte increment
cx16.VERA_ADDR_M = msb(tgtaddr)
cx16.VERA_ADDR_L = lsb(tgtaddr)
cx16.VERA_CTRL = 2<<1 ; dcsel = 2
cx16.VERA_FX_MULT = 0 ; NOTE(review): presumably clears multiplier settings and the cache byte index so the cache fills from byte 0 - confirm
cx16.VERA_FX_CTRL = %01100000 ; cache write enable + cache fill enable
cx16.r0 = num_longwords ; the loops below count on r0 (this clobbers the caller's r0)
if (cx16.r0L & 1) == 0 {
; even count: copy two longwords per iteration to halve the loop overhead
repeat cx16.r0>>1 {
%asm {{
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
stz cx16.VERA_DATA0 ; write 4 bytes at once.
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
stz cx16.VERA_DATA0 ; write 4 bytes at once.
}}
}
} else {
; odd count: copy one longword per iteration
repeat cx16.r0 {
%asm {{
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
lda cx16.VERA_DATA1
stz cx16.VERA_DATA0 ; write 4 bytes at once.
}}
}
}
cx16.VERA_FX_CTRL = 0 ; cache write + cache fill disable (dcsel is still 2 here, so this reaches the FX control register)
cx16.VERA_CTRL = 0 ; back to dcsel 0 / data port 0 addressing
}
; unsigned multiplication just passes the values as signed to muls
; if you do this yourself in your call to muls, it will save a few instructions.
sub mult(uword value1, uword value2) -> uword {

View File

@ -625,6 +625,11 @@ the emulators already support it).
Very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
The routine is around 3 times faster than a regular unrolled loop to clear vram.
``copy``
Very quickly copy a portion of the video memory to somewhere else in vram (4 bytes at a time).
Sometimes this is also called "blitting".
This routine is around 40-50% faster than a regular byte-by-byte copy.
``transparency``
Enable or disable transparent writes (color 0 will be transparent if enabled).

View File

@ -2,12 +2,6 @@
TODO
====
- verafx vram-vram copy routine?
set the cache fill and cache write bits in fx ctrl, set one data port's increment to 1 and the other one to 4,
Assuming your writes are aligned to 32-bit boundaries, do four reads from the increment-1 port
(ex: lda DATA1 ; 4 times) and then stz the other one (stz DATA0).
The cache is loaded by the DATA1 reads, and the contents are written out with the DATA0 write, 4 bytes at once.
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
...

View File

@ -1,14 +1,51 @@
%import textio
%zeropage basicsafe
%import verafx
%import diskio
%zeropage dontuse
main {
sub start() {
uword module
module++
module.test++
txt.uppercase()
txt.print("abcdefghijklmnopqrstuvwxyz\n")
txt.print("ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
txt.print("0123456789!@#$%^&*()-=[]<>\n")
void diskio.vload_raw("fantasy.pf", 0, $4000)
; for cx16.r0 in $4000 to $4000+256*$0008 {
; cx16.vpoke(0, cx16.r0, %10101010)
; }
cbm.SETTIM(0,0,0)
repeat 1000 {
slowcopy(0, $4000, 1, $f000, 8*256/4)
}
txt.print("\nslow copy time: ")
txt.print_uw(cbm.RDTIM16())
txt.nl()
sys.wait(60)
txt.lowercase()
sys.wait(120)
cbm.SETTIM(0,0,0)
repeat 1000 {
verafx.copy(0, $4000, 1, $f000, 8*256/4)
}
txt.print("verafx copy time: ")
txt.print_uw(cbm.RDTIM16())
txt.nl()
sys.wait(60)
; txt.uppercase()
}
sub slowcopy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
; baseline for the timing comparison: plain vram-to-vram copy, one byte per read/write pair
; data port 0 is pointed at the source, data port 1 at the target
; (the final vaddr argument 1 is presumably the per-access address increment - confirm)
cx16.vaddr(srcbank, srcaddr, 0, 1)
cx16.vaddr(tgtbank, tgtaddr, 1, 1)
repeat num_longwords {
unroll 4 cx16.VERA_DATA1=cx16.VERA_DATA0 ; move one longword, a byte at a time
}
cx16.VERA_CTRL = 0
}
}
module {
ubyte @shared test
}