mirror of
https://github.com/irmen/prog8.git
synced 2024-11-22 15:33:02 +00:00
added verafx.copy() routine for fast vram-to-vram copying ('blitting')
This commit is contained in:
parent
49ec430592
commit
6cd392909c
@ -21,7 +21,7 @@ verafx {
|
||||
return cx16.r1L
|
||||
}
|
||||
|
||||
sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) {
|
||||
sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
|
||||
; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value
|
||||
; this routine is around 3 times faster than gfx2.clear_screen()
|
||||
cx16.VERA_CTRL = 0
|
||||
@ -37,24 +37,24 @@ verafx {
|
||||
cx16.VERA_FX_MULT = 0
|
||||
cx16.VERA_FX_CTRL = %01000000 ; cache write enable
|
||||
|
||||
if (amountof32bits & %1111110000000011) == 0 {
|
||||
repeat lsb(amountof32bits >> 2)
|
||||
if (num_longwords & %1111110000000011) == 0 {
|
||||
repeat lsb(num_longwords >> 2)
|
||||
unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled
|
||||
}
|
||||
else if (amountof32bits & %1111111000000001) == 0 {
|
||||
repeat lsb(amountof32bits >> 1)
|
||||
else if (num_longwords & %1111111000000001) == 0 {
|
||||
repeat lsb(num_longwords >> 1)
|
||||
unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled
|
||||
}
|
||||
else if (lsb(amountof32bits) & 3) == 0 {
|
||||
repeat amountof32bits >> 2
|
||||
else if (lsb(num_longwords) & 3) == 0 {
|
||||
repeat num_longwords >> 2
|
||||
unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled
|
||||
}
|
||||
else if (lsb(amountof32bits) & 1) == 0 {
|
||||
repeat amountof32bits >> 1
|
||||
else if (lsb(num_longwords) & 1) == 0 {
|
||||
repeat num_longwords >> 1
|
||||
unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled
|
||||
}
|
||||
else {
|
||||
repeat amountof32bits
|
||||
repeat num_longwords
|
||||
cx16.VERA_DATA0=0 ; write 4 bytes at a time
|
||||
}
|
||||
|
||||
@ -62,6 +62,53 @@ verafx {
|
||||
cx16.VERA_CTRL = 0
|
||||
}
|
||||
|
||||
sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
|
||||
; use cached 4-byte writes to quickly copy a portion of the video memory to somewhere else
; srcbank/srcaddr: vram location that is read through VERA_DATA1 (set up with a 1-byte increment)
; tgtbank/tgtaddr: vram location that is written through VERA_DATA0 (set up with a 4-byte increment)
; num_longwords: number of 4-byte units to copy; an even count takes the 2x unrolled loop below
; overwrites cx16.r0, and leaves VERA_FX_CTRL and VERA_CTRL set to 0 on return
; NOTE(review): presumably tgtaddr must be aligned to a 32-bit boundary for the cached write - confirm against the VERA FX documentation
|
||||
; this routine is around 40-50% faster than a plain byte-by-byte copy
|
||||
cx16.VERA_CTRL = 1
|
||||
cx16.VERA_ADDR_H = srcbank | %00010000 ; source: 1-byte increment
|
||||
cx16.VERA_ADDR_M = msb(srcaddr)
|
||||
cx16.VERA_ADDR_L = lsb(srcaddr)
|
||||
cx16.VERA_CTRL = 0
|
||||
cx16.VERA_ADDR_H = tgtbank | %00110000 ; target: 4-byte increment
|
||||
cx16.VERA_ADDR_M = msb(tgtaddr)
|
||||
cx16.VERA_ADDR_L = lsb(tgtaddr)
|
||||
cx16.VERA_CTRL = 2<<1 ; dcsel = 2
|
||||
cx16.VERA_FX_MULT = 0
|
||||
cx16.VERA_FX_CTRL = %01100000 ; cache write enable + cache fill enable
|
||||
cx16.r0 = num_longwords ; working copy of the count (this overwrites cx16.r0)
|
||||
|
||||
if (cx16.r0L & 1) == 0 { ; even count: copy two longwords per loop iteration
|
||||
repeat cx16.r0>>1 {
|
||||
%asm {{
|
||||
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
stz cx16.VERA_DATA0 ; write 4 bytes at once.
|
||||
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
stz cx16.VERA_DATA0 ; write 4 bytes at once.
|
||||
}}
|
||||
}
|
||||
} else { ; odd count: copy one longword per loop iteration
|
||||
repeat cx16.r0 {
|
||||
%asm {{
|
||||
lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
lda cx16.VERA_DATA1
|
||||
stz cx16.VERA_DATA0 ; write 4 bytes at once.
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
cx16.VERA_FX_CTRL = 0 ; cache write + cache fill disable (dcsel is still 2 here)
|
||||
cx16.VERA_CTRL = 0
|
||||
}
|
||||
|
||||
; unsigned multiplication just passes the values as signed to muls
|
||||
; if you do this yourself in your call to muls, it will save a few instructions.
|
||||
sub mult(uword value1, uword value2) -> uword {
|
||||
|
@ -625,6 +625,11 @@ the emulators already support it).
|
||||
Very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time).
|
||||
The routine is around 3 times faster than a regular unrolled loop to clear vram.
|
||||
|
||||
``copy``
|
||||
Very quickly copy a portion of the video memory to somewhere else in vram (4 bytes at a time).
|
||||
Sometimes this is also called "blitting".
|
||||
This routine is around 40-50% faster than a regular byte-by-byte copy.
|
||||
|
||||
``transparency``
|
||||
Enable or disable transparent writes (color 0 will be transparent if enabled).
|
||||
|
||||
|
@ -2,12 +2,6 @@
|
||||
TODO
|
||||
====
|
||||
|
||||
- verafx vram-vram copy routine?
|
||||
set the cache fill and cache write bits in fx ctrl, set one data port's increment to 1 and the other one to 4,
|
||||
Assuming your writes are aligned to 32-bit boundaries, do four reads from the increment-1 port
|
||||
(ex: lda DATA1 ; 4 times) and then stz the other one (stz DATA0).
|
||||
The cache is loaded by the DATA1 reads, and the contents are written out with the DATA0 write, 4 bytes at once.
|
||||
|
||||
- [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
|
||||
|
||||
...
|
||||
|
@ -1,14 +1,51 @@
|
||||
%import textio
|
||||
%zeropage basicsafe
|
||||
%import verafx
|
||||
%import diskio
|
||||
%zeropage dontuse
|
||||
|
||||
main {
|
||||
sub start() {
|
||||
uword module
|
||||
module++
|
||||
module.test++
|
||||
txt.uppercase()
|
||||
txt.print("abcdefghijklmnopqrstuvwxyz\n")
|
||||
txt.print("ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
|
||||
txt.print("0123456789!@#$%^&*()-=[]<>\n")
|
||||
|
||||
void diskio.vload_raw("fantasy.pf", 0, $4000)
|
||||
; for cx16.r0 in $4000 to $4000+256*$0008 {
|
||||
; cx16.vpoke(0, cx16.r0, %10101010)
|
||||
; }
|
||||
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 1000 {
|
||||
slowcopy(0, $4000, 1, $f000, 8*256/4)
|
||||
}
|
||||
txt.print("\nslow copy time: ")
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
sys.wait(60)
|
||||
txt.lowercase()
|
||||
sys.wait(120)
|
||||
|
||||
cbm.SETTIM(0,0,0)
|
||||
repeat 1000 {
|
||||
verafx.copy(0, $4000, 1, $f000, 8*256/4)
|
||||
}
|
||||
txt.print("verafx copy time: ")
|
||||
txt.print_uw(cbm.RDTIM16())
|
||||
txt.nl()
|
||||
sys.wait(60)
|
||||
; txt.uppercase()
|
||||
}
|
||||
|
||||
sub slowcopy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
; plain vram-to-vram copy used as the timing baseline: moves data one byte at a time
; by reading VERA_DATA0 and storing the value into VERA_DATA1.
; takes the same parameters as verafx.copy; num_longwords counts 4-byte units (4 byte copies per loop iteration)
|
||||
cx16.vaddr(srcbank, srcaddr, 0, 1) ; source address; presumably the last two args mean data port 0 and increment 1 - confirm against cx16.vaddr
|
||||
cx16.vaddr(tgtbank, tgtaddr, 1, 1) ; target address; presumably data port 1 and increment 1 - confirm against cx16.vaddr
|
||||
repeat num_longwords {
|
||||
cx16.VERA_DATA1=cx16.VERA_DATA0
|
||||
cx16.VERA_DATA1=cx16.VERA_DATA0
|
||||
cx16.VERA_DATA1=cx16.VERA_DATA0
|
||||
cx16.VERA_DATA1=cx16.VERA_DATA0
|
||||
}
|
||||
cx16.VERA_CTRL = 0
|
||||
}
|
||||
}
|
||||
|
||||
module {
|
||||
ubyte @shared test
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user