prog8/compiler/res/prog8lib/cx16/verafx.p8

; Somewhat experimental Vera FX support.
; Docs:
; https://github.com/X16Community/x16-docs/blob/master/VERA%20FX%20Reference.md
; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg/edit

verafx {
    %option no_symbol_prefixing, ignore_unused

    sub available() -> bool {
        ; Returns true if Vera FX is available (Vera V0.3.1 or later), false if not.
        ; Uses cx16.r0L and cx16.r1L as scratch registers.
        ; VERA_CTRL is saved on entry and restored before returning.
        cx16.r1L = 0
        cx16.r0L = cx16.VERA_CTRL       ; remember the caller's VERA_CTRL
        cx16.VERA_CTRL = $7e            ; dcsel = 63 ($7e == 63<<1), to read the version registers
        if cx16.VERA_DC_VER0 == $56 {
            ; Signature byte matches, so the version number is valid.
            ; The version reads as VER1.VER2.VER3 (major.minor.build).
            if cx16.VERA_DC_VER1 != 0
                cx16.r1L = 1    ; any major version above 0 is later than 0.3.1, whatever minor/build say
            else
                cx16.r1L = mkword(cx16.VERA_DC_VER2, cx16.VERA_DC_VER3) >= $0301 as ubyte
        }
        cx16.VERA_CTRL = cx16.r0L
        return cx16.r1L as bool
    }

    sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
        ; Fills num_longwords * 4 bytes of video memory, starting at vbank:vaddr, with the byte 'data'.
        ; Works by loading the 32 bits FX cache with 4 copies of 'data' and then
        ; doing cached writes, so every store to VERA_DATA0 puts down 4 bytes at once.
        ; This routine is around 3 times faster than gfx2.clear_screen().
        ; On exit VERA_FX_CTRL and VERA_CTRL are both 0.

        ; ADDR0 = target address, stepping 4 bytes per write
        cx16.VERA_CTRL = 0
        cx16.VERA_ADDR_H = vbank | $30
        cx16.VERA_ADDR_M = msb(vaddr)
        cx16.VERA_ADDR_L = lsb(vaddr)

        ; dcsel = 6: load all four cache bytes with the fill value
        cx16.VERA_CTRL = 6<<1
        cx16.VERA_FX_CACHE_L = data
        cx16.VERA_FX_CACHE_M = data
        cx16.VERA_FX_CACHE_H = data
        cx16.VERA_FX_CACHE_U = data

        ; dcsel = 2: switch on cache writes
        cx16.VERA_CTRL = 2<<1
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = $40

        ; Pick the largest unroll factor that divides the count, and use a
        ; byte-sized repeat counter whenever the resulting iteration count fits in 8 bits.
        if (lsb(num_longwords) & 3) == 0 {
            ; count is a multiple of 4: 16 bytes per iteration
            if msb(num_longwords) < 4 {
                repeat lsb(num_longwords >> 2)
                    unroll 4 cx16.VERA_DATA0=0
            }
            else {
                repeat num_longwords >> 2
                    unroll 4 cx16.VERA_DATA0=0
            }
        }
        else if (lsb(num_longwords) & 1) == 0 {
            ; count is even (but not a multiple of 4): 8 bytes per iteration
            if msb(num_longwords) < 2 {
                repeat lsb(num_longwords >> 1)
                    unroll 2 cx16.VERA_DATA0=0
            }
            else {
                repeat num_longwords >> 1
                    unroll 2 cx16.VERA_DATA0=0
            }
        }
        else {
            ; odd count: 4 bytes per iteration
            repeat num_longwords
                cx16.VERA_DATA0=0
        }

        cx16.VERA_FX_CTRL = 0       ; cache writes off again
        cx16.VERA_CTRL = 0
    }

    sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
        ; Copies num_longwords * 4 bytes of video memory from srcbank:srcaddr to tgtbank:tgtaddr.
        ; Four byte-reads through VERA_DATA1 fill the FX cache, after which a single
        ; store to VERA_DATA0 writes those 4 bytes to the target in one go.
        ; This routine is about 50% faster than a plain byte-by-byte copy.
        ; Note: cx16.r0 is overwritten (it receives num_longwords).
        ; On exit VERA_FX_CTRL and VERA_CTRL are both 0.

        ; ADDR1 = source, stepping 1 byte per read
        cx16.VERA_CTRL = 1
        cx16.VERA_ADDR_H = srcbank | $10
        cx16.VERA_ADDR_M = msb(srcaddr)
        cx16.VERA_ADDR_L = lsb(srcaddr)

        ; ADDR0 = target, stepping 4 bytes per write
        cx16.VERA_CTRL = 0
        cx16.VERA_ADDR_H = tgtbank | $30
        cx16.VERA_ADDR_M = msb(tgtaddr)
        cx16.VERA_ADDR_L = lsb(tgtaddr)

        ; dcsel = 2: enable cache writes ($40) together with cache fill ($20)
        cx16.VERA_CTRL = 2<<1
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = $60
        cx16.r0 = num_longwords

        if (cx16.r0L & 1) != 0 {
            ; odd count: move one longword per iteration
            repeat cx16.r0 {
                %asm {{
                    lda  cx16.VERA_DATA1    ; 4 reads load the cache...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0    ; ...1 write stores all 4 bytes
                }}
            }
        } else {
            ; even count: move two longwords per iteration, halving the loop overhead
            repeat cx16.r0>>1 {
                %asm {{
                    lda  cx16.VERA_DATA1    ; 4 reads load the cache...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0    ; ...1 write stores all 4 bytes
                    lda  cx16.VERA_DATA1    ; and the same again for the next longword
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0
                }}
            }
        }

        cx16.VERA_FX_CTRL = 0    ; cache write + cache fill off again
        cx16.VERA_CTRL = 0
    }

    ; Unsigned multiplication: passes the values as signed to muls and then
    ; corrects the upper word of the result for the unsigned interpretation.
    ; If you only need the lower 16 bits (which are the same for signed and unsigned),
    ; calling muls yourself with the values cast to word will save a few instructions.
    sub mult(uword value1, uword value2) -> uword {
        ; Returns the lower 16 bits of the 32 bits result,
        ; the upper 16 bits are stored in cx16.r0 so you can access those separately.
        ; It's not part of the subroutine's signature to avoid awkward use of multiple returnvalues.
        ; Has the same side effects as muls (which does the actual work).
        uword product_lo = muls(value1 as word, value2 as word) as uword

        ; muls left the upper word of the *signed* product in cx16.r0.
        ; An operand with bit 15 set was treated as (operand - 65536), which makes the
        ; 32 bits product too small by 65536 * the other operand. That only affects the
        ; upper word, so add the other operand back there (wrapping around at 16 bits).
        if value1 >= $8000
            cx16.r0 += value2
        if value2 >= $8000
            cx16.r0 += value1

        return product_lo
    }

    asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY {
        ; Signed 16x16 bits multiplication using the Vera FX multiplier.
        ; Returns the lower 16 bits of the 32 bits result in AY,
        ; the upper 16 bits are stored in cx16.r0 so you can access those separately.
        ; It's not part of the subroutine's signature to avoid awkward use of multiple returnvalues.
        ; Side effects:
        ;  - the 4 bytes of video memory at $1f9bc-$1f9bf are overwritten with the result
        ;  - ADDR0 is left pointing just past those bytes, with increment 1
        ;  - VERA_FX_CTRL, VERA_FX_MULT and VERA_CTRL are all left at 0
        %asm {{
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL        ; $9F25
            stz  cx16.VERA_FX_CTRL     ; $9F29 (mainly to reset Addr1 Mode to 0)
            lda  #%00010000
            sta  cx16.VERA_FX_MULT     ; $9F2C  multiplier enable
            lda  #(6 << 1)
            sta  cx16.VERA_CTRL        ; $9F25
            lda  cx16.r0               ; the two operands go into the 32 bits cache:
            ldy  cx16.r0+1             ; value1 in the lower half...
            sta  cx16.VERA_FX_CACHE_L  ; $9F29
            sty  cx16.VERA_FX_CACHE_M  ; $9F2A
            lda  cx16.r1               ; ...value2 in the upper half
            ldy  cx16.r1+1
            sta  cx16.VERA_FX_CACHE_H  ; $9F2B
            sty  cx16.VERA_FX_CACHE_U  ; $9F2C
            lda  cx16.VERA_FX_ACCUM_RESET   ; $9F29 (DCSEL=6)  reading this resets the accumulator

            ; Set the ADDR0 pointer to $1f9bc and write our multiplication result there
            ; (these are the 4 bytes just before the PSG registers start)
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL
            lda  #%01000000           ; Cache Write Enable
            sta  cx16.VERA_FX_CTRL
            lda  #$bc
            sta  cx16.VERA_ADDR_L
            lda  #$f9
            sta  cx16.VERA_ADDR_M
            lda  #$01
            sta  cx16.VERA_ADDR_H     ; no increment
            stz  cx16.VERA_DATA0      ; multiply and write out result
            lda  #%00010001           ; $01 with Increment 1
            sta  cx16.VERA_ADDR_H     ; so we can read out the result
            lda  cx16.VERA_DATA0      ; result bits 0-7   -> A
            ldy  cx16.VERA_DATA0      ; result bits 8-15  -> Y
            ldx  cx16.VERA_DATA0      ; store the upper 16 bits of the result in r0
            stx  cx16.r0
            ldx  cx16.VERA_DATA0
            stx  cx16.r0+1
            stz  cx16.VERA_FX_CTRL    ; Cache write disable
            stz  cx16.VERA_FX_MULT    ; $9F2C  multiplier disable again (DCSEL is still 2 here),
                                      ; so later cache writes by other code are not multiplied
            stz  cx16.VERA_CTRL       ; reset DCSEL
            rts
        }}
    }

    sub transparency(bool enable) {
        ; Sets (enable=true) or clears (enable=false) bit 7 of VERA_FX_CTRL,
        ; leaving the other bits of that register as they were.
        ; VERA_CTRL is left at 0 afterwards.
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2 selects VERA_FX_CTRL
        if not enable {
            cx16.VERA_FX_CTRL &= $7f
        } else {
            cx16.VERA_FX_CTRL |= $80
        }
        cx16.VERA_CTRL = 0
    }
}
allow %option no_symbol_prefixing also on module scope 2023-12-26 11:31:18 +00:00			`; Somewhat experimental Vera FX support.`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`; Docs:`
			`; https://github.com/X16Community/x16-docs/blob/master/VERA%20FX%20Reference.md`
			`; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg/edit`

			`verafx {`
added %option ignore_unused to suppress warnings about unused vars and subs in that module/block. Also improved error for invalid directive. 2023-12-26 22:37:59 +00:00			`%option no_symbol_prefixing, ignore_unused`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00
added verafx.available() 2023-10-10 18:26:16 +00:00			`sub available() -> bool {`
			`; returns true if Vera FX is available (Vera V0.3.1 or later), false if not.`
			`cx16.r1L = 0`
			`cx16.r0L = cx16.VERA_CTRL`
			`cx16.VERA_CTRL = $7e`
			`if cx16.VERA_DC_VER0 == $56 {`
			`; Vera version number is valid.`
			`; Vera fx is available on Vera version 0.3.1 and later,`
			`; so no need to even check VERA_DC_VER1, which contains 0 (or higher)`
libraries: add ==0 or !=0 to expressions that depend on implicit conversion from byte to bool 2024-02-04 22:22:43 +00:00			`cx16.r1L = mkword(cx16.VERA_DC_VER2, cx16.VERA_DC_VER3) >= $0301 as ubyte`
added verafx.available() 2023-10-10 18:26:16 +00:00			`}`
			`cx16.VERA_CTRL = cx16.r0L`
libraries: add ==0 or !=0 to expressions that depend on implicit conversion from byte to bool 2024-02-04 22:22:43 +00:00			`return cx16.r1L as bool`
added verafx.available() 2023-10-10 18:26:16 +00:00			`}`

added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {`
added verafx.clear() 2023-10-01 23:34:56 +00:00			`; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value`
			`; this routine is around 3 times faster as gfx2.clear_screen()`
			`cx16.VERA_CTRL = 0`
			`cx16.VERA_ADDR_H = vbank \| %00110000 ; 4-byte increment`
			`cx16.VERA_ADDR_M = msb(vaddr)`
			`cx16.VERA_ADDR_L = lsb(vaddr)`
			`cx16.VERA_CTRL = 6<<1 ; dcsel = 6, fill the 32 bits cache`
			`cx16.VERA_FX_CACHE_L = data`
			`cx16.VERA_FX_CACHE_M = data`
			`cx16.VERA_FX_CACHE_H = data`
			`cx16.VERA_FX_CACHE_U = data`
			`cx16.VERA_CTRL = 2<<1 ; dcsel = 2`
			`cx16.VERA_FX_MULT = 0`
			`cx16.VERA_FX_CTRL = %01000000 ; cache write enable`

added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`if (num_longwords & %1111110000000011) == 0 {`
			`repeat lsb(num_longwords >> 2)`
floats.parse_f uses kernal VAL if it's present 2023-11-30 22:07:25 +00:00			`unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled`
added verafx.clear() 2023-10-01 23:34:56 +00:00			`}`
added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`else if (num_longwords & %1111111000000001) == 0 {`
			`repeat lsb(num_longwords >> 1)`
floats.parse_f uses kernal VAL if it's present 2023-11-30 22:07:25 +00:00			`unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled`
added verafx.clear() 2023-10-01 23:34:56 +00:00			`}`
added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`else if (lsb(num_longwords) & 3) == 0 {`
			`repeat num_longwords >> 2`
floats.parse_f uses kernal VAL if it's present 2023-11-30 22:07:25 +00:00			`unroll 4 cx16.VERA_DATA0=0 ; write 4*4 bytes at a time, unrolled`
fix gfx2 screen fill broken when using verafx 2023-10-01 22:12:48 +00:00			`}`
added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`else if (lsb(num_longwords) & 1) == 0 {`
			`repeat num_longwords >> 1`
floats.parse_f uses kernal VAL if it's present 2023-11-30 22:07:25 +00:00			`unroll 2 cx16.VERA_DATA0=0 ; write 2*4 bytes at a time, unrolled`
added verafx.clear() 2023-10-01 23:34:56 +00:00			`}`
			`else {`
added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`repeat num_longwords`
added verafx.clear() 2023-10-01 23:34:56 +00:00			`cx16.VERA_DATA0=0 ; write 4 bytes at a time`
			`}`

			`cx16.VERA_FX_CTRL = 0 ; cache write disable`
			`cx16.VERA_CTRL = 0`
fix gfx2 screen fill broken when using verafx 2023-10-01 22:12:48 +00:00			`}`

added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {`
			`; use cached 4-byte writes to quickly copy a portion of the video memory to somewhere else`
`call` now returns a word value 2023-12-22 21:24:11 +00:00			`; this routine is about 50% faster as a plain byte-by-byte copy`
added verafx.copy() routine for fast vram-to-vram copying ('blitting') 2023-12-22 16:52:43 +00:00			`cx16.VERA_CTRL = 1`
			`cx16.VERA_ADDR_H = srcbank \| %00010000 ; source: 1-byte increment`
			`cx16.VERA_ADDR_M = msb(srcaddr)`
			`cx16.VERA_ADDR_L = lsb(srcaddr)`
			`cx16.VERA_CTRL = 0`
			`cx16.VERA_ADDR_H = tgtbank \| %00110000 ; target: 4-byte increment`
			`cx16.VERA_ADDR_M = msb(tgtaddr)`
			`cx16.VERA_ADDR_L = lsb(tgtaddr)`
			`cx16.VERA_CTRL = 2<<1 ; dcsel = 2`
			`cx16.VERA_FX_MULT = 0`
			`cx16.VERA_FX_CTRL = %01100000 ; cache write enable + cache fill enable`
			`cx16.r0 = num_longwords`

			`if (cx16.r0L & 1) == 0 {`
			`repeat cx16.r0>>1 {`
			`%asm {{`
			`lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`stz cx16.VERA_DATA0 ; write 4 bytes at once.`
			`lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`stz cx16.VERA_DATA0 ; write 4 bytes at once.`
			`}}`
			`}`
			`} else {`
			`repeat cx16.r0 {`
			`%asm {{`
			`lda cx16.VERA_DATA1 ; fill cache with 4 source bytes...`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`lda cx16.VERA_DATA1`
			`stz cx16.VERA_DATA0 ; write 4 bytes at once.`
			`}}`
			`}`
			`}`

			`cx16.VERA_FX_CTRL = 0 ; cache write disable`
			`cx16.VERA_CTRL = 0`
			`}`

added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`; unsigned multiplication just passes the values as signed to muls`
			`; if you do this yourself in your call to muls, it will save a few instructions.`
			`sub mult(uword value1, uword value2) -> uword {`
verafx.mult/muls now return upper 16 bits of the result in r0 2023-11-06 20:55:58 +00:00			`; Returns the lower 16 bits of the 32 bits result,`
			`; the upper 16 bits are stored in cx16.r0 so you can access those separately.`
			`; It's not part of the subroutine's signature to avoid awkward use of multiple returnvalues.`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`return muls(value1 as word, value2 as word) as uword`
			`}`

verafx.mult/muls now return upper 16 bits of the result in r0 2023-11-06 20:55:58 +00:00			`asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY {`
			`; Returns the lower 16 bits of the 32 bits result in AY,`
			`; the upper 16 bits are stored in cx16.r0 so you can access those separately.`
			`; It's not part of the subroutine's signature to avoid awkward use of multiple returnvalues.`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`%asm {{`
			`lda #(2 << 1)`
			`sta cx16.VERA_CTRL ; $9F25`
			`stz cx16.VERA_FX_CTRL ; $9F29 (mainly to reset Addr1 Mode to 0)`
			`lda #%00010000`
			`sta cx16.VERA_FX_MULT ; $9F2C`
			`lda #(6 << 1)`
			`sta cx16.VERA_CTRL ; $9F25`
			`lda cx16.r0`
			`ldy cx16.r0+1`
			`sta cx16.VERA_FX_CACHE_L ; $9F29`
			`sty cx16.VERA_FX_CACHE_M ; $9F2A`
			`lda cx16.r1`
			`ldy cx16.r1+1`
			`sta cx16.VERA_FX_CACHE_H ; $9F2B`
			`sty cx16.VERA_FX_CACHE_U ; $9F2C`
			`lda cx16.VERA_FX_ACCUM_RESET ; $9F29 (DCSEL=6)`

			`; Set the ADDR0 pointer to $1f9bc and write our multiplication result there`
			`; (these are the 4 bytes just before the PSG registers start)`
			`lda #(2 << 1)`
			`sta cx16.VERA_CTRL`
			`lda #%01000000 ; Cache Write Enable`
			`sta cx16.VERA_FX_CTRL`
			`lda #$bc`
			`sta cx16.VERA_ADDR_L`
			`lda #$f9`
			`sta cx16.VERA_ADDR_M`
			`lda #$01`
			`sta cx16.VERA_ADDR_H ; no increment`
			`stz cx16.VERA_DATA0 ; multiply and write out result`
			`lda #%00010001 ; $01 with Increment 1`
			`sta cx16.VERA_ADDR_H ; so we can read out the result`
			`lda cx16.VERA_DATA0`
			`ldy cx16.VERA_DATA0`
verafx.mult/muls now return upper 16 bits of the result in r0 2023-11-06 20:55:58 +00:00			`ldx cx16.VERA_DATA0 ; store the upper 16 bits of the result in r0`
			`stx cx16.r0`
			`ldx cx16.VERA_DATA0`
			`stx cx16.r0+1`
fix gfx2 screen fill broken when using verafx 2023-10-01 22:12:48 +00:00			`stz cx16.VERA_FX_CTRL ; Cache write disable`
			`stz cx16.VERA_CTRL ; reset DCSEL`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`rts`
			`}}`
			`}`
add verafx.transparency() 2023-10-02 23:47:52 +00:00
			`sub transparency(bool enable) {`
			`cx16.VERA_CTRL = 2<<1 ; dcsel = 2`
			`if enable`
			`cx16.VERA_FX_CTRL \|= %10000000`
			`else`
			`cx16.VERA_FX_CTRL &= %01111111`
			`cx16.VERA_CTRL = 0`
			`}`
added cx16 verafx library module 2023-09-24 21:00:40 +00:00			`}`