diff --git a/compiler/res/prog8lib/cx16/syslib.p8 b/compiler/res/prog8lib/cx16/syslib.p8 index a7d078ed9..3d2c4bea4 100644 --- a/compiler/res/prog8lib/cx16/syslib.p8 +++ b/compiler/res/prog8lib/cx16/syslib.p8 @@ -267,6 +267,17 @@ cx16 { &ubyte VERA_AUDIO_DATA = VERA_BASE + $001D &ubyte VERA_SPI_DATA = VERA_BASE + $001E &ubyte VERA_SPI_CTRL = VERA_BASE + $001F + + ; experimental Vera FX registers: (depends on particular value set in VERA_CTRL!!!) + &ubyte VERA_FX_CTRL = VERA_BASE + $0009 + &ubyte VERA_FX_MULT = VERA_BASE + $000C + &ubyte VERA_FX_CACHE_L = VERA_BASE + $0009 + &ubyte VERA_FX_CACHE_M = VERA_BASE + $000A + &ubyte VERA_FX_CACHE_H = VERA_BASE + $000B + &ubyte VERA_FX_CACHE_U = VERA_BASE + $000C + &ubyte VERA_FX_ACCUM_RESET = VERA_BASE + $0009 ; (DCSEL=6) + + ; VERA_PSG_BASE = $1F9C0 ; VERA_PALETTE_BASE = $1FA00 ; VERA_SPRITES_BASE = $1FC00 diff --git a/compiler/res/prog8lib/cx16/verafx.p8 b/compiler/res/prog8lib/cx16/verafx.p8 new file mode 100644 index 000000000..fb4aef2ea --- /dev/null +++ b/compiler/res/prog8lib/cx16/verafx.p8 @@ -0,0 +1,59 @@ +; Experimental Vera FX support. +; Docs: +; https://github.com/X16Community/x16-docs/blob/master/VERA%20FX%20Reference.md +; https://docs.google.com/document/d/1q34uWOiM3Be2pnaHRVgSdHySI-qsiQWPTo_gfE54PTg/edit + +verafx { + + ; unsigned multiplication just passes the values as signed to muls + ; if you do this yourself in your call to muls, it will save a few instructions. + sub mult(uword value1, uword value2) -> uword { + return muls(value1 as word, value2 as word) as uword + } + + asmsub muls(word value1 @R0, word value2 @R1) -> word @AY { + %asm {{ + lda #(2 << 1) + sta cx16.VERA_CTRL ; $9F25 + stz cx16.VERA_FX_CTRL ; $9F29 (mainly to reset Addr1 Mode to 0) + lda #%00010000 + sta cx16.VERA_FX_MULT ; $9F2C + lda #(6 << 1) + sta cx16.VERA_CTRL ; $9F25 + lda cx16.r0 + ldy cx16.r0+1 + sta cx16.VERA_FX_CACHE_L ; $9F29 + sty cx16.VERA_FX_CACHE_M ; $9F2A + lda cx16.r1 + ldy cx16.r1+1 + sta cx16.VERA_FX_CACHE_H ; $9F2B + sty cx16.VERA_FX_CACHE_U ; $9F2C + lda cx16.VERA_FX_ACCUM_RESET ; $9F29 (DCSEL=6) + + ; Set the ADDR0 pointer to $1f9bc and write our multiplication result there + ; (these are the 4 bytes just before the PSG registers start) + lda #(2 << 1) + sta cx16.VERA_CTRL + lda #%01000000 ; Cache Write Enable + sta cx16.VERA_FX_CTRL + lda #$bc + sta cx16.VERA_ADDR_L + lda #$f9 + sta cx16.VERA_ADDR_M + lda #$01 + sta cx16.VERA_ADDR_H ; no increment + stz cx16.VERA_DATA0 ; multiply and write out result + lda #%00010001 ; $01 with Increment 1 + sta cx16.VERA_ADDR_H ; so we can read out the result + stz cx16.VERA_FX_CTRL ; Cache write disable + lda cx16.VERA_DATA0 + ldy cx16.VERA_DATA0 + rts +; we skip the upper 16 bits of the result: +; lda cx16.VERA_DATA0 +; sta $0402 +; lda cx16.VERA_DATA0 +; sta $0403 + }} + } +} diff --git a/compiler/res/prog8lib/math.asm b/compiler/res/prog8lib/math.asm index 2b63d1fd8..8f9920d6d 100644 --- a/compiler/res/prog8lib/math.asm +++ b/compiler/res/prog8lib/math.asm @@ -64,6 +64,7 @@ multiply_words .proc ; but there currently is no way to use 4 consecutive bytes in ZP (without disabling irq and saving/restoring them)... ; mult62.a +; from: https://github.com/TobyLobster/multiply_test/blob/main/tests/mult62.a ; based on Dr Jefyll, http://forum.6502.org/viewtopic.php?f=9&t=689&start=0#p19958 ; - adjusted to use fixed zero page addresses ; - removed 'decrement to avoid clc' as this is slower on average diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst index 8d9ba98ce..cfe8bd2e5 100644 --- a/docs/source/libraries.rst +++ b/docs/source/libraries.rst @@ -485,3 +485,17 @@ See the examples/cx16/sprites/dragon.p8 and dragons.p8 programs for ideas how to Read the `source code `_ to see what's in there. + + +verafx (cx16 only) +------------------- +Available for the Cx16 target. +Experimental routines that use the new Vera FX logic (hopefully coming in the Vera in new X16 boards, +the emulators already support it). + +For now, the hardware 16*16 multiplier is exposed via ``mult`` and ``muls`` routines. +They are about 4 to 5 times faster as the default 6502 cpu routine for word multiplication. +But they depend on + +Read the `source code `_ +to see what's in there. diff --git a/examples/test.p8 b/examples/test.p8 index 041bfa835..52c8a97bc 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -1,26 +1,39 @@ %import textio -%import floats +%import verafx +%zeropage basicsafe +%option no_sysinit main { sub start() { - ubyte from = 10 - ubyte compare=9 - if from==compare - goto equal - txt.print("from is not compare\n") -equal: + const word MULTIPLIER = 431 - ubyte end = 15 - ubyte xx - for xx in from to end { - txt.print_ub(xx) - txt.spc() + ; verify results: + for value in -50 to 50 { + if value*MULTIPLIER != verafx.muls(value, MULTIPLIER) { + txt.print("verafx muls error\n") + sys.exit(1) + } } + + + word value + txt.print("verafx muls...") + cbm.SETTIM(0,0,0) + for value in -50 to 50 { + repeat 250 void verafx.muls(value, MULTIPLIER) + } + txt.print_uw(cbm.RDTIM16()) + txt.nl() + + txt.print("6502 muls...") + cbm.SETTIM(0,0,0) + for value in -50 to 50 { + repeat 250 cx16.r0s = value*MULTIPLIER + } + txt.print_uw(cbm.RDTIM16()) txt.nl() - ubyte ten=9 - if from!=ten - txt.print("from is not 10\n") } } +