slightly faster sqrt() routine for integers

This commit is contained in:
Irmen de Jong 2023-08-14 15:16:46 +02:00
parent d8991894e3
commit bbb6c53457
6 changed files with 45 additions and 37 deletions

View File

@ -4,7 +4,8 @@
; some more interesting routines can be found here:
; http://6502org.wikidot.com/software-math
; http://codebase64.org/doku.php?id=base:6502_6510_maths
;
; https://github.com/TobyLobster/multiply_test
; https://github.com/TobyLobster/sqrt_test
multiply_bytes .proc

View File

@ -68,11 +68,8 @@ asmsub RDTIM() -> ubyte @ A, ubyte @ X, ubyte @ Y {
asmsub RDTIM16() clobbers(X) -> uword @AY {
; -- like RDTIM() but only returning the lower 16 bits in AY for convenience
%asm {{
jsr cbm.RDTIM
pha
txa
tay
pla
lda TIME_LO
ldy TIME_MID
rts
}}
}
@ -127,7 +124,7 @@ asmsub cleanup_at_exit() {
asmsub waitvsync() clobbers(A) {
; --- busy wait till the next vsync has occurred (approximately), without depending on custom irq handling.
; TODO: on PET this now simply waits until the next jiffy clock update
; Note: on PET this simply waits until the next jiffy clock update, I don't know if a true vsync is possible there
%asm {{
lda #1
ldy #0

View File

@ -127,32 +127,41 @@ _possibly_zero cmp #0
func_sqrt16_into_A .proc
; integer square root from http://6502org.wikidot.com/software-math-sqrt
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda #0
sta P8ZP_SCRATCH_B1
sta P8ZP_SCRATCH_REG
ldx #8
- sec
lda P8ZP_SCRATCH_W1+1
; integer square root
; http://6502org.wikidot.com/software-math-sqrt
; https://github.com/TobyLobster/sqrt_test/blob/main/sqrt/sqrt7.a
; Tweaked by TobyLobster and 0xC0DE to be smaller and faster
_numl = P8ZP_SCRATCH_W1
_numh = P8ZP_SCRATCH_W1+1
_loop_counter = P8ZP_SCRATCH_REG
_root = P8ZP_SCRATCH_B1
sta _numl
sty _numh
ldx #$ff
stx _loop_counter
inx
stx _root
sec
_loop lda _numh
sbc #$40
tay
lda P8ZP_SCRATCH_REG
sbc P8ZP_SCRATCH_B1
txa
sbc _root
bcc +
sty P8ZP_SCRATCH_W1+1
sta P8ZP_SCRATCH_REG
+ rol P8ZP_SCRATCH_B1
asl P8ZP_SCRATCH_W1
rol P8ZP_SCRATCH_W1+1
rol P8ZP_SCRATCH_REG
asl P8ZP_SCRATCH_W1
rol P8ZP_SCRATCH_W1+1
rol P8ZP_SCRATCH_REG
dex
bne -
lda P8ZP_SCRATCH_B1
sty _numh
bcs ++
+ txa
+ rol _root
asl _numl
rol _numh
rol a
asl _numl
rol _numh
rol a
tax
lsr _loop_counter
bne _loop
lda _root
rts
.pend

View File

@ -14,7 +14,7 @@ Currently these machines can be selected as a compilation target (via the ``-tar
- 'c64': the Commodore 64
- 'cx16': the `Commander X16 <https://www.commanderx16.com/>`_
- 'c128': the Commodore 128 (*limited support*)
- 'pet32': the Commodore PET 4032 (*experimental support*)
- 'pet32': the Commodore PET 4032 (*limited support*)
- 'atari': the Atari 800 XL (*experimental support*)
- 'virtual': a builtin virtual machine

View File

@ -1,7 +1,8 @@
TODO
====
- check mult and sqrt routines with the benchmarked ones on https://github.com/TobyLobster/sqrt_test / https://github.com/TobyLobster/multiply_test
- don't allow txt.print('@') if possible, don't cast up a byte to str
- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
- is math.square still the fastest after this? (now used for word*word)
- [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
- IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction

View File

@ -14,7 +14,7 @@
<keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" />
<keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%import;%ir;%launcher;%option;%output;%zeropage;%zpreserved;iso:;petscii:;sc:" />
<keywords3 keywords="@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" />
<keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;sqrt16;swap;|&gt;" />
<keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;swap;|&gt;" />
</highlighting>
<extensionMap>
<mapping ext="p8" />