slightly faster sqrt() routine for integers

This commit is contained in:
Irmen de Jong 2023-08-14 15:16:46 +02:00
parent d8991894e3
commit bbb6c53457
6 changed files with 45 additions and 37 deletions

View File

@ -4,7 +4,8 @@
; some more interesting routines can be found here: ; some more interesting routines can be found here:
; http://6502org.wikidot.com/software-math ; http://6502org.wikidot.com/software-math
; http://codebase64.org/doku.php?id=base:6502_6510_maths ; http://codebase64.org/doku.php?id=base:6502_6510_maths
; ; https://github.com/TobyLobster/multiply_test
; https://github.com/TobyLobster/sqrt_test
multiply_bytes .proc multiply_bytes .proc

View File

@ -68,11 +68,8 @@ asmsub RDTIM() -> ubyte @ A, ubyte @ X, ubyte @ Y {
asmsub RDTIM16() clobbers(X) -> uword @AY { asmsub RDTIM16() clobbers(X) -> uword @AY {
; -- like RDTIM() but only returning the lower 16 bits in AY for convenience ; -- like RDTIM() but only returning the lower 16 bits in AY for convenience
%asm {{ %asm {{
jsr cbm.RDTIM lda TIME_LO
pha ldy TIME_MID
txa
tay
pla
rts rts
}} }}
} }
@ -127,7 +124,7 @@ asmsub cleanup_at_exit() {
asmsub waitvsync() clobbers(A) { asmsub waitvsync() clobbers(A) {
; --- busy wait till the next vsync has occurred (approximately), without depending on custom irq handling. ; --- busy wait till the next vsync has occurred (approximately), without depending on custom irq handling.
; TODO: on PET this now simply waits until the next jiffy clock update ; Note: on PET this simply waits until the next jiffy clock update; I don't know whether a true vsync is possible there
%asm {{ %asm {{
lda #1 lda #1
ldy #0 ldy #0

View File

@ -127,33 +127,42 @@ _possibly_zero cmp #0
func_sqrt16_into_A .proc func_sqrt16_into_A .proc
; integer square root from http://6502org.wikidot.com/software-math-sqrt ; integer square root
sta P8ZP_SCRATCH_W1 ; http://6502org.wikidot.com/software-math-sqrt
sty P8ZP_SCRATCH_W1+1 ; https://github.com/TobyLobster/sqrt_test/blob/main/sqrt/sqrt7.a
lda #0 ; Tweaked by TobyLobster and 0xC0DE to be smaller and faster
sta P8ZP_SCRATCH_B1 _numl = P8ZP_SCRATCH_W1
sta P8ZP_SCRATCH_REG _numh = P8ZP_SCRATCH_W1+1
ldx #8 _loop_counter = P8ZP_SCRATCH_REG
- sec _root = P8ZP_SCRATCH_B1
lda P8ZP_SCRATCH_W1+1 sta _numl
sbc #$40 sty _numh
tay ldx #$ff
lda P8ZP_SCRATCH_REG stx _loop_counter
sbc P8ZP_SCRATCH_B1 inx
bcc + stx _root
sty P8ZP_SCRATCH_W1+1 sec
sta P8ZP_SCRATCH_REG _loop lda _numh
+ rol P8ZP_SCRATCH_B1 sbc #$40
asl P8ZP_SCRATCH_W1 tay
rol P8ZP_SCRATCH_W1+1 txa
rol P8ZP_SCRATCH_REG sbc _root
asl P8ZP_SCRATCH_W1 bcc +
rol P8ZP_SCRATCH_W1+1 sty _numh
rol P8ZP_SCRATCH_REG bcs ++
dex + txa
bne - + rol _root
lda P8ZP_SCRATCH_B1 asl _numl
rts rol _numh
rol a
asl _numl
rol _numh
rol a
tax
lsr _loop_counter
bne _loop
lda _root
rts
.pend .pend

View File

@ -14,7 +14,7 @@ Currently these machines can be selected as a compilation target (via the ``-tar
- 'c64': the Commodore 64 - 'c64': the Commodore 64
- 'cx16': the `Commander X16 <https://www.commanderx16.com/>`_ - 'cx16': the `Commander X16 <https://www.commanderx16.com/>`_
- 'c128': the Commodore 128 (*limited support*) - 'c128': the Commodore 128 (*limited support*)
- 'pet32': the Commodore PET 4032 (*experimental support*) - 'pet32': the Commodore PET 4032 (*limited support*)
- 'atari': the Atari 800 XL (*experimental support*) - 'atari': the Atari 800 XL (*experimental support*)
- 'virtual': a builtin virtual machine - 'virtual': a builtin virtual machine

View File

@ -1,7 +1,8 @@
TODO TODO
==== ====
- check mult and sqrt routines with the benchmarked ones on https://github.com/TobyLobster/sqrt_test / https://github.com/TobyLobster/multiply_test - don't allow txt.print('@') if possible, don't cast up a byte to str
- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
- is math.square still the fastest after this? (now used for word*word) - is math.square still the fastest after this? (now used for word*word)
- [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 .... - [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
- IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction

View File

@ -14,7 +14,7 @@
<keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" /> <keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" />
<keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%import;%ir;%launcher;%option;%output;%zeropage;%zpreserved;iso:;petscii:;sc:" /> <keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%import;%ir;%launcher;%option;%output;%zeropage;%zpreserved;iso:;petscii:;sc:" />
<keywords3 keywords="@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" /> <keywords3 keywords="@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" />
<keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;sqrt16;swap;|&gt;" /> <keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;swap;|&gt;" />
</highlighting> </highlighting>
<extensionMap> <extensionMap>
<mapping ext="p8" /> <mapping ext="p8" />