diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8 index 274c7408f..a0341183e 100644 --- a/compiler/res/prog8lib/cx16/verafx.p8 +++ b/compiler/res/prog8lib/cx16/verafx.p8 @@ -6,15 +6,45 @@ verafx { %option no_symbol_prefixing - sub fill(ubyte vbank, uword vaddr, ubyte data, uword numlongs) { - ; TODO use vera fx cache write - cx16.vaddr(vbank, vaddr, 0, true) - repeat numlongs { - cx16.VERA_DATA0 = data - cx16.VERA_DATA0 = data - cx16.VERA_DATA0 = data - cx16.VERA_DATA0 = data + sub clear(ubyte vbank, uword vaddr, ubyte data, uword amountof32bits) { + ; use cached 4-byte write to quickly clear a portion of the video memory to a given byte value + ; this routine is around 3 times faster than gfx2.clear_screen(). amountof32bits = number of 4-byte writes + cx16.VERA_CTRL = 0 + cx16.VERA_ADDR_H = vbank | %00110000 ; 4-byte increment + cx16.VERA_ADDR_M = msb(vaddr) + cx16.VERA_ADDR_L = lsb(vaddr) + cx16.VERA_CTRL = 6<<1 ; dcsel = 6, fill the 32-bit cache with the data byte + cx16.VERA_FX_CACHE_L = data + cx16.VERA_FX_CACHE_M = data + cx16.VERA_FX_CACHE_H = data + cx16.VERA_FX_CACHE_U = data + cx16.VERA_CTRL = 2<<1 ; dcsel = 2, for the FX_MULT and FX_CTRL writes below + cx16.VERA_FX_MULT = 0 + cx16.VERA_FX_CTRL = %01000000 ; cache write enable + + if (amountof32bits & %1111110000000011) == 0 { + repeat lsb(amountof32bits >> 2) + unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled } + else if (amountof32bits & %1111111000000001) == 0 { + repeat lsb(amountof32bits >> 1) + unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled + } + else if (lsb(amountof32bits) & 3) == 0 { + repeat amountof32bits >> 2 + unroll 4 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled + } + else if (lsb(amountof32bits) & 1) == 0 { + repeat amountof32bits >> 1 + unroll 2 cx16.VERA_DATA0=0 ; write 4 bytes at a time, unrolled + } + else { + repeat amountof32bits + cx16.VERA_DATA0=0 ; write 4 bytes at a time + } + + cx16.VERA_FX_CTRL = 0 ; cache write disable + cx16.VERA_CTRL = 0 } ; unsigned multiplication just passes the values as signed to 
muls diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst index cfe8bd2e5..33da4d4ef 100644 --- a/docs/source/libraries.rst +++ b/docs/source/libraries.rst @@ -493,9 +493,14 @@ Available for the Cx16 target. Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards, the emulators already support it). -For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines. -They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication. -But they depend on +``mult``, ``muls`` + For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines (unsigned and signed respectively). + They are about 4 to 5 times faster than the default 6502 cpu routine for word multiplication. + But they depend on some Vera manipulation and 4 bytes in vram just below the PSG registers for storage. + +``clear`` + There's also a ``clear`` routine here to very quickly clear a piece of vram to a given byte value (it writes 4 bytes at a time). + The routine is around 3 times faster than a regular unrolled loop to clear vram. Read the `source code `_ to see what's in there. diff --git a/docs/source/todo.rst b/docs/source/todo.rst index ff09bc848..62ddcce73 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -1,6 +1,8 @@ TODO ==== +- '>>=' can be used as an operator in an expression?? It should only be allowed as an augmented assignment! + - [on branch: shortcircuit] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... - [on branch: ir-less-branch-opcodes] IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction - IR: reduce amount of CMP/CMPI after instructions that set the status bits correctly (LOADs? INC? etc), but only after setting the status bits is verified! 
diff --git a/examples/test.p8 b/examples/test.p8 index 89efde817..9f3d54bce 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -37,8 +37,24 @@ main { txt.nl() gfx2.screen_mode(1) - verafx.fill(0, 0, %10101010, 1200) ; should fill top half of the screen - verafx.fill(0, 4800, %11111111, 1200) ; should fill bottom half of the screen + + cbm.SETTIM(0,0,0) + repeat 255 { + gfx2.clear_screen() + } + uword time1 = cbm.RDTIM16() + + cbm.SETTIM(0,0,0) + repeat 255 { + verafx.clear(0, 0, %10101010, 2400) + } + uword time2 = cbm.RDTIM16() + + gfx2.screen_mode(0) + txt.print_uw(time1) + txt.spc() + txt.print_uw(time2) + txt.nl() ; txt.print_uw(math.mul16_last_upper())