From fbe231793b8d851ae41be2aaf320808fc02b7468 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Sat, 16 Mar 2024 00:45:25 +0100
Subject: [PATCH] optimized and added "streaming" crc32 and crc16 routines to
 math module.  Return value is put in different register now! r14+r15 instead
 of r0+r1!

---
 compiler/res/prog8lib/math.p8         | 160 ++++++++++++++++++++------
 compiler/res/prog8lib/virtual/math.p8 | 106 ++++++++++++-----
 docs/source/libraries.rst             |  14 ++-
 examples/cx16/diskspeed.p8            |  10 +-
 examples/test.p8                      |  48 +++++---
 5 files changed, 248 insertions(+), 90 deletions(-)

diff --git a/compiler/res/prog8lib/math.p8 b/compiler/res/prog8lib/math.p8
index 71073f777..b472d8897 100644
--- a/compiler/res/prog8lib/math.p8
+++ b/compiler/res/prog8lib/math.p8
@@ -504,47 +504,137 @@ log2_tab
 
     sub crc16(uword data, uword length) -> uword {
         ; calculates the CRC16 (XMODEM) checksum of the buffer.
-        cx16.r1 = data  ; make sure pointer is in zp (on cx16)
-        cx16.r0 = 0  ; the crc value
-        repeat length {
-            cx16.r0H ^= @(cx16.r1)
-            repeat 8 {
-                if cx16.r0H & $80 !=0 {
-                    cx16.r0 <<= 1
-                    cx16.r0 ^= $1021
-                }
-                else
-                    cx16.r0<<=1
-            }
-            cx16.r1++
+        ; Overwrites cx16.r13, cx16.r14 and cx16.r15. There are also "streaming" crc16_start/update/end routines below, that allow you to calculate crc16 for data that doesn't fit in a single memory block.
+        crc16_start()
+        cx16.r13 = data
+        cx16.r14 = data+length
+        while cx16.r13!=cx16.r14 {
+            crc16_update(@(cx16.r13))
+            cx16.r13++
         }
-        return cx16.r0
+        return crc16_end()
+    }
+
+    sub crc16_start() {
+        ; start the "streaming" crc16
+        ; note: tracks the crc16 checksum in cx16.r15!
+        ;       if your code uses that, it must save/restore it before calling this routine
+        cx16.r15 = 0
+    }
+
+    asmsub crc16_update(ubyte value @A) {
+        ; update the "streaming" crc16 with next byte value
+        ; note: tracks the crc16 checksum in cx16.r15!
+        ;       if your code uses that, it must save/restore it before calling this routine
+        %asm {{
+            eor  cx16.r15H
+            sta  cx16.r15H
+            ldy  #8
+-           lda  cx16.r15H
+            asl  cx16.r15L
+            rol  cx16.r15H
+            and  #$80
+            beq  +
+            lda  cx16.r15H
+            eor  #$10
+            sta  cx16.r15H
+            lda  cx16.r15L
+            eor  #$21
+            sta  cx16.r15L
++           dey
+            bne  -
+            rts
+        }}
+; original prog8 code was:
+;        cx16.r15H ^= value
+;        repeat 8 {
+;            if cx16.r15H & $80 !=0 {
+;                cx16.r15 <<=1
+;                cx16.r15 ^= $1021
+;            } else
+;                cx16.r15<<=1
+;        }
+    }
+
+    sub crc16_end() -> uword {
+        ; finalize the "streaming" crc16, returns resulting crc16 value
+        return cx16.r15
     }
 
     sub crc32(uword data, uword length) {
         ; Calculates the CRC-32 (POSIX) checksum of the buffer.
         ; because prog8 doesn't have 32 bits integers, we have to split up the calculation over 2 words.
-        ; result stored in cx16.r0 (low word) and cx16.r1 (high word)
-        cx16.r2 = data  ; make sure pointer is in zp (on cx16)
-        cx16.r1 = 0
-        cx16.r0 = 0
-        repeat length {
-            cx16.r1H ^= @(cx16.r2)
-            repeat 8 {
-                if cx16.r1H & $80 !=0 {
-                    cx16.r0 <<= 1
-                    rol(cx16.r1)
-                    cx16.r1 ^= $04c1
-                    cx16.r0 ^= $1db7
-                }
-                else {
-                    cx16.r0 <<= 1
-                    rol(cx16.r1)
-                }
-            }
-            cx16.r2++
+        ; result stored in cx16.r14 (low word) and cx16.r15 (high word)
+        ; Also overwrites cx16.r12 and cx16.r13. There are also "streaming" crc32_start/update/end routines below, that allow you to calculate crc32 for data that doesn't fit in a single memory block.
+        crc32_start()
+        cx16.r12 = data
+        cx16.r13 = data+length
+        while cx16.r12!=cx16.r13 {
+            crc32_update(@(cx16.r12))
+            cx16.r12++
         }
-        cx16.r1 ^= $ffff
-        cx16.r0 ^= $ffff
+        crc32_end()
+    }
+
+    sub crc32_start() {
+        ; start the "streaming" crc32
+        ; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
+        ;       if your code uses these, it must save/restore them before calling this routine
+        cx16.r14 = cx16.r15 = 0
+    }
+
+    asmsub crc32_update(ubyte value @A) {
+        ; update the "streaming" crc32 with next byte value
+        ; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
+        ;       if your code uses these, it must save/restore them before calling this routine
+        %asm {{
+            eor  cx16.r15H
+            sta  cx16.r15H
+            ldy  #8
+-           lda  cx16.r15H
+            asl  cx16.r14L
+            rol  cx16.r14H
+            rol  cx16.r15L
+            rol  cx16.r15H
+            and  #$80
+            beq  +
+            lda  cx16.r15H
+            eor  #$04
+            sta  cx16.r15H
+            lda  cx16.r15L
+            eor  #$c1
+            sta  cx16.r15L
+            lda  cx16.r14H
+            eor  #$1d
+            sta  cx16.r14H
+            lda  cx16.r14L
+            eor  #$b7
+            sta  cx16.r14L
++           dey
+            bne  -
+            rts
+        }}
+; original prog8 code:
+;        cx16.r15H ^= value
+;        repeat 8 {
+;            if cx16.r15H & $80 !=0 {
+;                cx16.r14 <<= 1
+;                rol(cx16.r15)
+;                cx16.r15 ^= $04c1
+;                cx16.r14 ^= $1db7
+;            }
+;            else {
+;                cx16.r14 <<= 1
+;                rol(cx16.r15)
+;            }
+;        }
+
+    }
+
+    sub crc32_end() {
+        ; finalize the "streaming" crc32
+        ; result stored in cx16.r14 (low word) and cx16.r15 (high word)
+        cx16.r15 ^= $ffff
+        cx16.r14 ^= $ffff
     }
 }
diff --git a/compiler/res/prog8lib/virtual/math.p8 b/compiler/res/prog8lib/virtual/math.p8
index ff70e2477..e7087710b 100644
--- a/compiler/res/prog8lib/virtual/math.p8
+++ b/compiler/res/prog8lib/virtual/math.p8
@@ -313,43 +313,87 @@ math {
 
     sub crc16(uword data, uword length) -> uword {
         ; calculates the CRC16 (XMODEM) checksum of the buffer.
-        cx16.r0 = 0  ; the crc value
-        repeat length {
-            cx16.r0H ^= @(data)
-            repeat 8 {
-                if cx16.r0H & $80 !=0
-                    cx16.r0 = (cx16.r0<<1)^$1021
-                else
-                    cx16.r0<<=1
-            }
-            data++
+        ; Overwrites cx16.r13, cx16.r14 and cx16.r15. There are also "streaming" crc16_start/update/end routines below, that allow you to calculate crc16 for data that doesn't fit in a single memory block.
+        crc16_start()
+        cx16.r13 = data
+        cx16.r14 = data+length
+        while cx16.r13!=cx16.r14 {
+            crc16_update(@(cx16.r13))
+            cx16.r13++
         }
-        return cx16.r0
+        return crc16_end()
+    }
+
+    sub crc16_start() {
+        ; start the "streaming" crc16
+        ; note: tracks the crc16 checksum in cx16.r15!
+        ;       if your code uses that, it must save/restore it before calling this routine
+        cx16.r15 = 0
+    }
+
+    sub crc16_update(ubyte value) {
+        ; update the "streaming" crc16 with next byte value
+        ; note: tracks the crc16 checksum in cx16.r15!
+        ;       if your code uses that, it must save/restore it before calling this routine
+        cx16.r15H ^= value
+        repeat 8 {
+            if cx16.r15H & $80 !=0
+                cx16.r15 = (cx16.r15<<1)^$1021
+            else
+                cx16.r15<<=1
+        }
+    }
+
+    sub crc16_end() -> uword {
+        ; finalize the "streaming" crc16, returns resulting crc16 value
+        return cx16.r15
     }
 
     sub crc32(uword data, uword length) {
         ; Calculates the CRC-32 (POSIX) checksum of the buffer.
         ; because prog8 doesn't have 32 bits integers, we have to split up the calculation over 2 words.
-        ; result stored in cx16.r0 (low word) and cx16.r1 (high word)
-        cx16.r1 = 0
-        cx16.r0 = 0
-        repeat length {
-            cx16.r1H ^= @(data)
-            repeat 8 {
-                if cx16.r1H & $80 !=0 {
-                    cx16.r0 <<= 1
-                    rol(cx16.r1)
-                    cx16.r1 ^= $04c1
-                    cx16.r0 ^= $1db7
-                }
-                else {
-                    cx16.r0 <<= 1
-                    rol(cx16.r1)
-                }
-            }
-            data++
+        ; result stored in cx16.r14 (low word) and cx16.r15 (high word)
+        ; Also overwrites cx16.r12 and cx16.r13. There are also "streaming" crc32_start/update/end routines below, that allow you to calculate crc32 for data that doesn't fit in a single memory block.
+        crc32_start()
+        cx16.r12 = data
+        cx16.r13 = data+length
+        while cx16.r12!=cx16.r13 {
+            crc32_update(@(cx16.r12))
+            cx16.r12++
         }
-        cx16.r1 ^= $ffff
-        cx16.r0 ^= $ffff
+        crc32_end()
+    }
+
+    sub crc32_start() {
+        ; start the "streaming" crc32
+        ; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
+        ;       if your code uses these, it must save/restore them before calling this routine
+        cx16.r14 = cx16.r15 = 0
+    }
+
+    sub crc32_update(ubyte value) {
+        ; update the "streaming" crc32 with next byte value
+        ; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
+        ;       if your code uses these, it must save/restore them before calling this routine
+        cx16.r15H ^= value
+        repeat 8 {
+            if cx16.r15H & $80 !=0 {
+                cx16.r14 <<= 1
+                rol(cx16.r15)
+                cx16.r15 ^= $04c1
+                cx16.r14 ^= $1db7
+            }
+            else {
+                cx16.r14 <<= 1
+                rol(cx16.r15)
+            }
+        }
+    }
+
+    sub crc32_end() {
+        ; finalize the "streaming" crc32
+        ; result stored in cx16.r14 (low word) and cx16.r15 (high word)
+        cx16.r15 ^= $ffff
+        cx16.r14 ^= $ffff
     }
 }
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 4342b83df..b05a5fd0c 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -532,12 +532,22 @@ but perhaps the provided ones can be of service too.
 ``crc16 (uword data, uword length) -> uword``
     Returns a CRC-16 (XMODEM) checksum over the given data buffer.
     Note: on the Commander X16, there is a CRC-16 routine in the kernal: cx16.memory_crc().
-    That one is faster, but yields different results. It is unclear what flavour of crc it is calculating.
+    That one is faster, but yields different results. It is unclear to me what flavour of crc it is calculating. Note that crc16() overwrites cx16.r13, cx16.r14 and cx16.r15.
+
+``crc16_start() / crc16_update(ubyte value) / crc16_end() -> uword``
+    "Streaming" crc16 calculation routines, for when the data doesn't fit in a single buffer.
+    Tracks the crc16 checksum in cx16.r15! If your code uses that register, it must save/restore it before calling these routines!
+    Call the start() routine first, feed it bytes with the update() routine, and finalize by calling the end() routine, which returns the crc16 value.
 
 ``crc32 (uword data, uword length)``
     Calculates a CRC-32 (POSIX) checksum over the given data buffer.
-    The 32 bits result is stored in cx16.r0 (low word) and cx16.r1 (high word).
+    The 32-bit result is stored in cx16.r14 (low word) and cx16.r15 (high word). cx16.r12 and cx16.r13 are overwritten as well.
 
+``crc32_start() / crc32_update(ubyte value) / crc32_end()``
+    "Streaming" crc32 calculation routines, for when the data doesn't fit in a single buffer.
+    Tracks the crc32 checksum in cx16.r14 and cx16.r15! If your code uses these registers, it must save/restore them before calling these routines!
+    Call the start() routine first, feed it bytes with the update() routine, and finalize by calling the end() routine.
+    The 32-bit result is stored in cx16.r14 (low word) and cx16.r15 (high word).
 
 cx16logo
 --------
diff --git a/examples/cx16/diskspeed.p8 b/examples/cx16/diskspeed.p8
index 1d9d13b02..9be27dccc 100644
--- a/examples/cx16/diskspeed.p8
+++ b/examples/cx16/diskspeed.p8
@@ -24,8 +24,8 @@ main {
                 large[cx16.r0] = math.rnd()
             }
             math.crc32(large, 20000)
-            uword crc32_l = cx16.r0
-            uword crc32_h = cx16.r1
+            uword crc32_l = cx16.r14
+            uword crc32_h = cx16.r15
         }
 
         txt.print("\n\x12diskio.save()\x92 writing 10*20kb=200kb total")
@@ -168,9 +168,9 @@ main {
     sub compare_crc32(uword ptr, uword size, uword crc32_low, uword crc32_high)
     {
         math.crc32(ptr, size)
-        if cx16.r0!=crc32_low or cx16.r1!=crc32_high {
-            txt.print_uwhex(cx16.r1, true)
-            txt.print_uwhex(cx16.r0, false)
+        if cx16.r14!=crc32_low or cx16.r15!=crc32_high {
+            txt.print_uwhex(cx16.r15, true)
+            txt.print_uwhex(cx16.r14, false)
             txt.nl()
             txt.print_uwhex(crc32_high, true)
             txt.print_uwhex(crc32_low, false)
diff --git a/examples/test.p8 b/examples/test.p8
index 8f4cd8b0e..bd656ca3e 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,27 +1,41 @@
+%import math
 %import textio
 %zeropage basicsafe
 %option no_sysinit
 
+; $029f
+
 main {
     sub start() {
-        cx16.reset_system()
-        repeat {
-            for cx16.r0L in 0 to 255 {
-               cx16.set_led_brightness(cx16.r0L)
-               delay()
-            }
-            for cx16.r0L in 255 downto 0 {
-               cx16.set_led_brightness(cx16.r0L)
-               delay()
-            }
-        }
-    }
+        txt.print("crc16\n")
+        txt.print_uwhex(math.crc16($0800, 32768), true)
+        txt.nl()
 
-    sub delay() {
-        repeat 2000 {
-            %asm {{
-                nop
-            }}
+        cx16.r15 = 0
+        math.crc16_start()
+        for cx16.r9 in $0800 to $0800+32768-1 {
+            math.crc16_update(@(cx16.r9))
         }
+        txt.print_uwhex(math.crc16_end(), true)
+        txt.nl()
+
+        txt.print("crc32\n")
+        cx16.r0 = cx16.r1 = 0
+        math.crc32($0800, 32768)
+        txt.print_uwhex(cx16.r15, true)
+        txt.print_uwhex(cx16.r14, false)
+        txt.nl()
+
+        cx16.r0 = cx16.r1 = 0
+        math.crc32_start()
+        for cx16.r9 in $0800 to $0800+32768-1 {
+            math.crc32_update(@(cx16.r9))
+        }
+        math.crc32_end()
+        txt.print_uwhex(cx16.r15, true)
+        txt.print_uwhex(cx16.r14, false)
+        txt.nl()
     }
 }
+
+