; Partial Vera FX support:
; - fast 32-bit cached writes (clear, copy)
; - transparent write setting
; - hardware 16-bit multiplications
;
; Docs:
; https://github.com/X16Community/x16-docs/blob/fb63156cca2d6de98be0577aacbe4ddef458f896/X16%20Reference%20-%2010%20-%20VERA%20FX%20Reference.md
; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg

verafx {
    %option no_symbol_prefixing, ignore_unused

    sub available() -> bool {
        ; Reports whether Vera FX can be used: true on Vera V0.3.1 or later, false otherwise.
        ; Side effects: cx16.r0L and cx16.r0H serve as scratch storage, so cx16.r0 is overwritten.
        ; VERA_CTRL is put back to the value it had on entry.
        cx16.r0H = 0                        ; verdict so far: not available
        cx16.r0L = cx16.VERA_CTRL           ; remember VERA_CTRL so it can be restored below
        cx16.VERA_CTRL = 63<<1              ; dcsel = 63 (same value as $7e) before reading the version registers
        if cx16.VERA_DC_VER0 == $56 {
            ; The version number is valid. Vera FX exists from version 0.3.1 onwards.
            if cx16.VERA_DC_VER1 == 0 {
                ; major version 0: minor.patch must be at least 3.1
                cx16.r0H = mkword(cx16.VERA_DC_VER2, cx16.VERA_DC_VER3) >= $0301 as ubyte
            } else {
                ; any major version above 0 is new enough
                cx16.r0H = 1
            }
        }
        cx16.VERA_CTRL = cx16.r0L
        return cx16.r0H != 0
    }

    sub clear(ubyte vbank, uword vaddr, ubyte data, uword num_longwords) {
        ; Fills num_longwords * 4 bytes of video memory, starting at vaddr in bank vbank, with the byte 'data'.
        ; All four bytes of the 32 bit FX cache are loaded with 'data'; with cache writes enabled,
        ; every write to DATA0 then stores 4 bytes at once and ADDR0 advances by 4.
        ; This routine is around 3 times faster than gfx_hires/gfx_lores.clear_screen().
        ; On return VERA_FX_CTRL and VERA_CTRL are both 0.

        ; aim ADDR0 at the start of the area to fill
        cx16.VERA_CTRL = 0
        cx16.VERA_ADDR_L = lsb(vaddr)
        cx16.VERA_ADDR_M = msb(vaddr)
        cx16.VERA_ADDR_H = %00110000 | vbank        ; 4-byte increment

        ; dcsel = 6: put the fill byte in all four positions of the 32 bits cache
        cx16.VERA_CTRL = 6<<1
        cx16.VERA_FX_CACHE_L = data
        cx16.VERA_FX_CACHE_M = data
        cx16.VERA_FX_CACHE_H = data
        cx16.VERA_FX_CACHE_U = data

        ; dcsel = 2: switch on cache writes
        cx16.VERA_CTRL = 2<<1
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01000000               ; cache write enable

        ; Pick the most unrolled loop the count allows, and use a byte-sized
        ; repeat counter whenever the number of iterations fits in 8 bits.
        ; The value written to DATA0 is always 0; the fill data comes from the cache.
        if (lsb(num_longwords) & 1) != 0 {
            ; odd count: write 4 bytes at a time
            repeat num_longwords {
                cx16.VERA_DATA0 = 0
            }
        }
        else if (lsb(num_longwords) & 3) == 0 {
            ; multiple of 4: write 4*4 bytes at a time, unrolled
            if num_longwords < 1024 {
                repeat lsb(num_longwords >> 2) {
                    unroll 4 {
                        cx16.VERA_DATA0 = 0
                    }
                }
            } else {
                repeat num_longwords >> 2 {
                    unroll 4 {
                        cx16.VERA_DATA0 = 0
                    }
                }
            }
        }
        else {
            ; even but not a multiple of 4: write 2*4 bytes at a time, unrolled
            if num_longwords < 512 {
                repeat lsb(num_longwords >> 1) {
                    unroll 2 {
                        cx16.VERA_DATA0 = 0
                    }
                }
            } else {
                repeat num_longwords >> 1 {
                    unroll 2 {
                        cx16.VERA_DATA0 = 0
                    }
                }
            }
        }

        cx16.VERA_FX_CTRL = 0       ; cache write disable
        cx16.VERA_CTRL = 0
    }

    sub copy(ubyte srcbank, uword srcaddr, ubyte tgtbank, uword tgtaddr, uword num_longwords) {
        ; Copies num_longwords * 4 bytes of video memory from srcbank:srcaddr to tgtbank:tgtaddr.
        ; The source is read one byte at a time through DATA1 to fill the 32 bit FX cache;
        ; a single write to DATA0 then stores those 4 bytes at the target in one go.
        ; This routine is about 50% faster than a plain byte-by-byte copy.
        ; Side effects: cx16.r0 is overwritten with num_longwords, both Vera address pointers are
        ; reprogrammed, VERA_FX_MULT is set to 0, and on return VERA_FX_CTRL and VERA_CTRL are both 0.
        cx16.VERA_CTRL = 1                           ; the address writes below go to the source pointer (read via DATA1)
        cx16.VERA_ADDR_H = srcbank | %00010000       ; source: 1-byte increment
        cx16.VERA_ADDR_M = msb(srcaddr)
        cx16.VERA_ADDR_L = lsb(srcaddr)
        cx16.VERA_CTRL = 0                           ; the address writes below go to the target pointer (written via DATA0)
        cx16.VERA_ADDR_H = tgtbank | %00110000       ; target: 4-byte increment
        cx16.VERA_ADDR_M = msb(tgtaddr)
        cx16.VERA_ADDR_L = lsb(tgtaddr)
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        cx16.VERA_FX_MULT = 0
        cx16.VERA_FX_CTRL = %01100000    ; cache write enable + cache fill enable
        cx16.r0 = num_longwords

        ; An even count is handled two longwords per loop iteration (half the loop overhead);
        ; an odd count falls back to one longword per iteration.
        if (cx16.r0L & 1) == 0 {
            repeat cx16.r0>>1 {
                %asm {{
                    lda  cx16.VERA_DATA1    ; fill cache with 4 source bytes (the values read into A are not used)...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
                    lda  cx16.VERA_DATA1    ; fill cache with the next 4 source bytes...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
                }}
            }
        } else {
            repeat cx16.r0 {
                %asm {{
                    lda  cx16.VERA_DATA1    ; fill cache with 4 source bytes (the values read into A are not used)...
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    lda  cx16.VERA_DATA1
                    stz  cx16.VERA_DATA0    ; write 4 bytes at once.
                }}
            }
        }

        cx16.VERA_FX_CTRL = 0    ; cache write + cache fill disable
        cx16.VERA_CTRL = 0
    }


    asmsub mult16(uword value1 @R0, uword value2 @R1) clobbers(X) -> uword @AY {
        ; Returns the 16 bits unsigned result of R0*R1 in AY.
        ; Note: only the lower 16 bits!   (the upper 16 bits are not valid for unsigned word multiplications, only for signed)
        ; Verafx doesn't support unsigned values like this for full 32 bit result.
        ; This is a thin wrapper around verafx.muls that keeps R0 intact for the caller.
        ; Note: because it calls verafx.muls, it also overwrites the 4 bytes of VRAM at $1f9bc - $1f9bf
        ; and changes the Vera ADDR0 pointer.
        ; Uses P8ZP_SCRATCH_W1 as temporary storage.
        %asm {{
            lda  cx16.r0                ; save R0, because muls stores the upper result word there
            sta  P8ZP_SCRATCH_W1
            lda  cx16.r0+1
            sta  P8ZP_SCRATCH_W1+1
            jsr  verafx.muls            ; lower 16 bits of the product come back in AY
            ldx  P8ZP_SCRATCH_W1        ; restore the caller's R0 via X so that A and Y stay untouched
            stx  cx16.r0
            ldx  P8ZP_SCRATCH_W1+1
            stx  cx16.r0+1
            rts
        }}
    }

    asmsub muls(word value1 @R0, word value2 @R1) clobbers(X) -> word @AY, word @R0 {
        ; Returns the 32 bits signed result in AY and R0  (lower word, upper word).
        ; Vera Fx multiplication support only works on signed values!
        ; How it works: both operands are placed in the 32 bit FX cache (value1 in the lower two bytes,
        ; value2 in the upper two bytes), the result is written to VRAM with a cache write, and then
        ; read back byte by byte.
        ; Note: overwrites the 4 bytes of VRAM at $1f9bc - $1f9bf (inclusive) and changes the ADDR0 pointer.
        ; On return VERA_FX_CTRL, VERA_FX_MULT and VERA_CTRL are all 0.
        %asm {{
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL        ; $9F25  (dcsel = 2)
            stz  cx16.VERA_FX_CTRL     ; $9F29 (mainly to reset Addr1 Mode to 0)
            lda  #%00010000
            sta  cx16.VERA_FX_MULT     ; $9F2C  set multiply bit
            lda  #(6 << 1)
            sta  cx16.VERA_CTRL        ; $9F25  (dcsel = 6, to load the cache)
            lda  cx16.r0               ; value1 goes into cache bytes L and M
            sta  cx16.VERA_FX_CACHE_L  ; $9F29
            lda  cx16.r0+1
            sta  cx16.VERA_FX_CACHE_M  ; $9F2A
            lda  cx16.r1               ; value2 goes into cache bytes H and U
            sta  cx16.VERA_FX_CACHE_H  ; $9F2B
            lda  cx16.r1+1
            sta  cx16.VERA_FX_CACHE_U  ; $9F2C
            lda  cx16.VERA_FX_ACCUM_RESET   ; $9F29 (DCSEL=6)  read access only; the value loaded into A is discarded

            ; Set the ADDR0 pointer to $1f9bc and write our multiplication result there
            ; (these are the 4 bytes just before the PSG registers start)
            lda  #(2 << 1)
            sta  cx16.VERA_CTRL
            lda  #%01000000           ; Cache Write Enable
            sta  cx16.VERA_FX_CTRL
            lda  #$bc
            sta  cx16.VERA_ADDR_L
            lda  #$f9
            sta  cx16.VERA_ADDR_M
            lda  #$01
            sta  cx16.VERA_ADDR_H     ; no increment
            stz  cx16.VERA_DATA0      ; multiply and write out result
            lda  #%00010001           ; $01 with Increment 1
            sta  cx16.VERA_ADDR_H     ; so we can read out the result
            lda  cx16.VERA_DATA0      ; store the lower 16 bits of the result in AY
            ldy  cx16.VERA_DATA0
            ldx  cx16.VERA_DATA0      ; store the upper 16 bits of the result in R0
            stx  cx16.r0s
            ldx  cx16.VERA_DATA0
            stx  cx16.r0s+1
            stz  cx16.VERA_FX_CTRL    ; Cache write disable
            stz  cx16.VERA_FX_MULT    ; $9F2C  reset multiply bit
            stz  cx16.VERA_CTRL       ; reset DCSEL
            rts
        }}
    }

    sub transparency(bool enable) {
        ; Switches transparent write mode on or off. It applies to VeraFX cached writes
        ; and also to normal writes to DATA0/DATA1.
        ; While it is on, pixels with value 0 leave VRAM unmodified when written (they are "transparent").
        ; Only the top bit of VERA_FX_CTRL is changed; the other FX control bits keep their value.
        ; On return VERA_CTRL is 0.
        cx16.VERA_CTRL = 2<<1       ; dcsel = 2
        if not enable {
            cx16.VERA_FX_CTRL &= %01111111      ; clear the transparent writes bit
        } else {
            cx16.VERA_FX_CTRL |= %10000000      ; set the transparent writes bit
        }
        cx16.VERA_CTRL = 0
    }
}