slightly faster sqrt() routine for integers

2024-11-18 04:08:58 +00:00 · 2023-08-14 15:16:46 +02:00 · 2023-08-14 15:16:46 +02:00 · bbb6c53457
commit bbb6c53457
parent d8991894e3
6 changed files with 45 additions and 37 deletions
--- a/compiler/res/prog8lib/math.asm
+++ b/compiler/res/prog8lib/math.asm
@@ -4,7 +4,8 @@
 ;  some more interesting routines can be found here:
 ;	http://6502org.wikidot.com/software-math
 ;	http://codebase64.org/doku.php?id=base:6502_6510_maths
-;
+;       https://github.com/TobyLobster/multiply_test
+;       https://github.com/TobyLobster/sqrt_test


 multiply_bytes	.proc
--- a/compiler/res/prog8lib/pet32/syslib.p8
+++ b/compiler/res/prog8lib/pet32/syslib.p8
@@ -68,11 +68,8 @@ asmsub RDTIM() -> ubyte @ A, ubyte @ X, ubyte @ Y {
 asmsub RDTIM16() clobbers(X) -> uword @AY {
    ; --  like RDTIM() but only returning the lower 16 bits in AY for convenience
    %asm {{
-        jsr  cbm.RDTIM
-        pha
-        txa
-        tay
-        pla
+        lda  TIME_LO
+        ldy  TIME_MID
        rts
    }}
 }
@@ -127,7 +124,7 @@ asmsub  cleanup_at_exit() {

    asmsub waitvsync() clobbers(A) {
        ; --- busy wait till the next vsync has occurred (approximately), without depending on custom irq handling.
-        ;     TODO: on PET this now simply waits until the next jiffy clock update
+        ;     Note: on PET this simply waits until the next jiffy clock update, I don't know if a true vsync is possible there
        %asm {{
            lda  #1
            ldy  #0
--- a/compiler/res/prog8lib/prog8_funcs.asm
+++ b/compiler/res/prog8lib/prog8_funcs.asm
@@ -127,32 +127,41 @@ _possibly_zero	cmp  #0


 func_sqrt16_into_A	.proc
-		; integer square root from  http://6502org.wikidot.com/software-math-sqrt
-		sta  P8ZP_SCRATCH_W1
-		sty  P8ZP_SCRATCH_W1+1
-		lda  #0
-		sta  P8ZP_SCRATCH_B1
-		sta  P8ZP_SCRATCH_REG
-		ldx  #8
-		sec
-		lda  P8ZP_SCRATCH_W1+1
+		; integer square root
+		; http://6502org.wikidot.com/software-math-sqrt
+		; https://github.com/TobyLobster/sqrt_test/blob/main/sqrt/sqrt7.a
+		; Tweaked by TobyLobster and 0xC0DE to be smaller and faster
+_numl = P8ZP_SCRATCH_W1
+_numh = P8ZP_SCRATCH_W1+1
+_loop_counter = P8ZP_SCRATCH_REG
+_root = P8ZP_SCRATCH_B1
+            sta  _numl
+            sty  _numh
+            ldx  #$ff
+            stx  _loop_counter
+            inx
+            stx  _root
+            sec
+_loop       lda  _numh
            sbc  #$40
            tay
-		lda  P8ZP_SCRATCH_REG
-		sbc  P8ZP_SCRATCH_B1
+            txa
+            sbc  _root
            bcc  +
-		sty  P8ZP_SCRATCH_W1+1
-		sta  P8ZP_SCRATCH_REG
-+		rol  P8ZP_SCRATCH_B1
-		asl  P8ZP_SCRATCH_W1
-		rol  P8ZP_SCRATCH_W1+1
-		rol  P8ZP_SCRATCH_REG
-		asl  P8ZP_SCRATCH_W1
-		rol  P8ZP_SCRATCH_W1+1
-		rol  P8ZP_SCRATCH_REG
-		dex
-		bne  -
-		lda  P8ZP_SCRATCH_B1
+            sty  _numh
+            bcs  ++
++           txa
++           rol  _root
+            asl  _numl
+            rol  _numh
+            rol  a
+            asl  _numl
+            rol  _numh
+            rol  a
+            tax
+            lsr  _loop_counter
+            bne  _loop
+            lda  _root
            rts
 		.pend

--- a/docs/source/targetsystem.rst
+++ b/docs/source/targetsystem.rst
@@ -14,7 +14,7 @@ Currently these machines can be selected as a compilation target (via the ``-tar
 - 'c64': the Commodore 64
 - 'cx16': the `Commander X16 <https://www.commanderx16.com/>`_
 - 'c128': the Commodore 128  (*limited support*)
-- 'pet32': the Commodore PET 4032  (*experimental support*)
+- 'pet32': the Commodore PET 4032  (*limited support*)
 - 'atari': the Atari 800 XL  (*experimental support*)
 - 'virtual': a builtin virtual machine

--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,7 +1,8 @@
 TODO
 ====

-- check mult and sqrt routines with the benchmarked ones on https://github.com/TobyLobster/sqrt_test / https://github.com/TobyLobster/multiply_test
+- don't allow txt.print('@')  if possible, don't cast up a byte to str
+- check mult routines with the benchmarked ones on https://github.com/TobyLobster/multiply_test
 - is math.square still the fastest after this? (now used for word*word)
 - [on branch:] investigate McCarthy evaluation again? this may also reduce code size perhaps for things like if a>4 or a<2 ....
 - IR: reduce the number of branch instructions such as BEQ, BEQR, etc (gradually), replace with CMP(I) + status branch instruction
--- a/syntax-files/IDEA/Prog8.xml
+++ b/syntax-files/IDEA/Prog8.xml
@@ -14,7 +14,7 @@
    <keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" />
    <keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%import;%ir;%launcher;%option;%output;%zeropage;%zpreserved;iso:;petscii:;sc:" />
    <keywords3 keywords="@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" />
-    <keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;sqrt16;swap;|&gt;" />
+    <keywords4 keywords="abs;all;any;callfar;callram;callrom;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekw;poke;pokew;pop;popw;push;pushw;reverse;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;sgn;sizeof;sort;sqrt;swap;|&gt;" />
  </highlighting>
  <extensionMap>
    <mapping ext="p8" />