; Mirror of https://github.com/irmen/prog8.git (synced 2025-01-19 19:33:52 +00:00).
; Mirror listing: 198 lines, 8.1 KiB. The "Lua" label is the mirror's language guess; the syntax below is Prog8.
; Somewhat experimental Vera FX support.
; Docs:
; https://github.com/X16Community/x16-docs/blob/fb63156cca2d6de98be0577aacbe4ddef458f896/X16%20Reference%20-%2010%20-%20VERA%20FX%20Reference.md
; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg

verafx {
    %option no_symbol_prefixing, ignore_unused

    sub available() -> bool {
        ; Returns true if Vera FX is available (Vera V0.3.1 or later), false if not.
        ; Uses cx16.r0L and cx16.r1L as scratch. VERA_CTRL is restored before returning.
        cx16.r1L = 0
        cx16.r0L = cx16.VERA_CTRL       ; remember the caller's VERA_CTRL
        cx16.VERA_CTRL = $7e            ; dcsel = 63 ($7e = 63<<1): exposes the VERA_DC_VERx bytes
        if cx16.VERA_DC_VER0 == $56 {
            ; Vera version number is valid ($56 = 'V').
            if cx16.VERA_DC_VER1 != 0 {
                ; Major version 1 or higher is always later than 0.3.1, whatever the
                ; minor/patch bytes contain. Without this check such a release
                ; (for instance 1.0.0) would wrongly be reported as lacking Vera FX.
                ; NOTE(review): assumes later major versions keep the FX feature set.
                cx16.r1L = 1
            } else {
                ; Major version 0: Vera FX needs at least minor.patch = 3.1
                cx16.r1L = mkword(cx16.VERA_DC_VER2, cx16.VERA_DC_VER3) >= $0301 as ubyte
            }
        }
        cx16.VERA_CTRL = cx16.r0L
        return cx16.r1L as bool
    }

    sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
        ; Use cached 4-byte writes to quickly fill a portion of the video memory with a given byte value.
        ;   vbank, vaddr  = start address in video memory (bank and 16-bit address)
        ;   data          = the byte value to fill with
        ;   num_longwords = number of 4-byte groups to write (4*num_longwords bytes in total)
        ; This routine is around 3 times faster than gfx2.clear_screen().
        ; On exit VERA_FX_CTRL, VERA_FX_MULT and VERA_CTRL are all 0.
        cx16.VERA_CTRL = 0
        cx16.VERA_ADDR_H = vbank | %00110000     ; 4-byte increment
        cx16.VERA_ADDR_M = msb(vaddr)
        cx16.VERA_ADDR_L = lsb(vaddr)
        cx16.VERA_CTRL = 6<<1       ; dcsel = 6, fill the 32 bits cache
        cx16.VERA_FX_CACHE_L = data
        cx16.VERA_FX_CACHE_M = data
        cx16.VERA_FX_CACHE_H = data
        cx16.VERA_FX_CACHE_U = data
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01000000    ; cache write enable

        ; Pick the most unrolled loop that divides num_longwords evenly.
        ; The first two cases additionally require the resulting loop count
        ; to fit in a single byte, so a byte-sized repeat counter suffices.
        if (num_longwords & %1111110000000011) == 0 {
            repeat lsb(num_longwords >> 2)
                unroll 4 cx16.VERA_DATA0=0       ; write 4*4 bytes at a time, unrolled
        }
        else if (num_longwords & %1111111000000001) == 0 {
            repeat lsb(num_longwords >> 1)
                unroll 2 cx16.VERA_DATA0=0       ; write 2*4 bytes at a time, unrolled
        }
        else if (lsb(num_longwords) & 3) == 0 {
            repeat num_longwords >> 2
                unroll 4 cx16.VERA_DATA0=0       ; write 4*4 bytes at a time, unrolled
        }
        else if (lsb(num_longwords) & 1) == 0 {
            repeat num_longwords >> 1
                unroll 2 cx16.VERA_DATA0=0       ; write 2*4 bytes at a time, unrolled
        }
        else {
            repeat num_longwords
                cx16.VERA_DATA0=0                ; write 4 bytes at a time
        }

        cx16.VERA_FX_CTRL = 0       ; cache write disable
        cx16.VERA_CTRL = 0
    }

    sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
        ; Use cached 4-byte writes to quickly copy a portion of the video memory to somewhere else.
        ;   srcbank, srcaddr = source address in video memory (read through VERA_DATA1)
        ;   tgtbank, tgtaddr = target address in video memory (written through VERA_DATA0)
        ;   num_longwords    = number of 4-byte groups to copy (4*num_longwords bytes in total)
        ; This routine is about 50% faster than a plain byte-by-byte copy.
        ; Uses cx16.r0 as scratch. On exit VERA_FX_CTRL, VERA_FX_MULT and VERA_CTRL are all 0.
        cx16.VERA_CTRL = 1
        cx16.VERA_ADDR_H = srcbank | %00010000      ; source: 1-byte increment
        cx16.VERA_ADDR_M = msb(srcaddr)
        cx16.VERA_ADDR_L = lsb(srcaddr)
        cx16.VERA_CTRL = 0
        cx16.VERA_ADDR_H = tgtbank | %00110000      ; target: 4-byte increment
        cx16.VERA_ADDR_M = msb(tgtaddr)
        cx16.VERA_ADDR_L = lsb(tgtaddr)
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01100000    ; cache write enable + cache fill enable
        cx16.r0 = num_longwords

        if (cx16.r0L & 1) == 0 {
            ; even count: copy two longwords per loop iteration
            repeat cx16.r0>>1 {
                %asm {{
                    lda  cx16.VERA_DATA1        ; fill cache with 4 source bytes...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0        ; write 4 bytes at once.
                    lda  cx16.VERA_DATA1        ; fill cache with 4 source bytes...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0        ; write 4 bytes at once.
                }}
            }
        } else {
            ; odd count: copy one longword per loop iteration
            repeat cx16.r0 {
                %asm {{
                    lda  cx16.VERA_DATA1        ; fill cache with 4 source bytes...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0        ; write 4 bytes at once.
                }}
            }
        }

        cx16.VERA_FX_CTRL = 0       ; cache write disable
        cx16.VERA_CTRL = 0
    }

    ; unsigned multiplication just passes the values as signed to muls
    ; if you do this yourself in your call to muls, it will save a few instructions.
    ; TODO fix this: verafx.muls doesn't support unsigned values like this for full 32 bit result
;    inline asmsub mult(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY, uword @R0 {
;        ; Returns the 32 bits unsigned result in AY and R0  (lower word, upper word).
;        %asm {{
;            jsr  verafx.muls
;        }}
;    }

    asmsub mult16(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY {
        ; Returns the 16 bits unsigned result of R0*R1 in AY.
        ; Note: only the lower 16 bits! (the upper 16 bits are not valid for unsigned word multiplications, only for signed)
        ; R0 is saved in P8ZP_SCRATCH_W1 and restored afterwards, because muls
        ; overwrites it with the upper word of its result.
        %asm {{
            lda  cx16.r0
            sta  P8ZP_SCRATCH_W1
            lda  cx16.r0+1
            sta  P8ZP_SCRATCH_W1+1
            jsr  verafx.muls
            ldx  P8ZP_SCRATCH_W1        ; restore R0 via X so the result in AY stays intact
            stx  cx16.r0
            ldx  P8ZP_SCRATCH_W1+1
            stx  cx16.r0+1
            rts
        }}
    }

    asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
        ; Returns the 32 bits signed result in AY and R0  (lower word, upper word).
        ; Vera Fx multiplication support only works on signed values!
        ; Side effects: overwrites the 4 bytes of video memory at $1f9bc, changes ADDR0,
        ; and leaves VERA_FX_CTRL, VERA_FX_MULT and VERA_CTRL set to 0.
        %asm {{
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL         ; $9F25
            stz  cx16.VERA_FX_CTRL      ; $9F29 (mainly to reset Addr1 Mode to 0)
            lda  #%00010000
            sta  cx16.VERA_FX_MULT      ; $9F2C  (multiplier enable)
            lda  #(6 << 1)
            sta  cx16.VERA_CTRL         ; $9F25
            lda  cx16.r0
            sta  cx16.VERA_FX_CACHE_L   ; $9F29
            lda  cx16.r0+1
            sta  cx16.VERA_FX_CACHE_M   ; $9F2A
            lda  cx16.r1
            sta  cx16.VERA_FX_CACHE_H   ; $9F2B
            lda  cx16.r1+1
            sta  cx16.VERA_FX_CACHE_U   ; $9F2C
            lda  cx16.VERA_FX_ACCUM_RESET   ; $9F29 (DCSEL=6)

            ; Set the ADDR0 pointer to $1f9bc and write our multiplication result there
            ; (these are the 4 bytes just before the PSG registers start)
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL
            lda  #%01000000             ; Cache Write Enable
            sta  cx16.VERA_FX_CTRL
            lda  #$bc
            sta  cx16.VERA_ADDR_L
            lda  #$f9
            sta  cx16.VERA_ADDR_M
            lda  #$01
            sta  cx16.VERA_ADDR_H       ; no increment
            stz  cx16.VERA_DATA0        ; multiply and write out result
            lda  #%00010001             ; $01 with Increment 1
            sta  cx16.VERA_ADDR_H       ; so we can read out the result
            lda  cx16.VERA_DATA0        ; store the lower 16 bits of the result in AY
            ldy  cx16.VERA_DATA0
            ldx  cx16.VERA_DATA0        ; store the upper 16 bits of the result in R0
            stx  cx16.r0s
            ldx  cx16.VERA_DATA0
            stx  cx16.r0s+1
            stz  cx16.VERA_FX_CTRL      ; Cache write disable
            ; Switch the multiplier off again (DCSEL is still 2 here, so $9F2C is FX_MULT).
            ; Otherwise later cache writes by code that does not rewrite FX_MULT itself
            ; would store the multiplier output instead of the cache contents.
            stz  cx16.VERA_FX_MULT
            stz  cx16.VERA_CTRL         ; reset DCSEL
            rts
        }}
    }

    sub transparency(bool enable) {
        ; Set transparent write mode for VeraFX cached writes and also for normal writes to DATA0/DATA1.
        ; If enabled, pixels with value 0 do not modify VRAM when written (so they are "transparent")
        ; Only bit 7 of VERA_FX_CTRL is changed; VERA_CTRL is set to 0 on exit.
        ; Note: clear(), copy() and muls() overwrite VERA_FX_CTRL completely and
        ; thereby switch transparency off again.
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        if enable
            cx16.VERA_FX_CTRL |= %10000000
        else
            cx16.VERA_FX_CTRL &= %01111111
        cx16.VERA_CTRL = 0
    }
}