44 Commits
1.3.0 ... 1.3.4

Author SHA1 Message Date
01228f3eeb Bump version 2020-08-18 11:57:25 +02:00
e3fd315541 Faster LZSA1 compression 2020-08-18 11:51:24 +02:00
5a0da16874 Increase LZSA2 ratio for some input files 2020-08-18 09:13:54 +02:00
028007b57c Bump version 2020-08-02 09:40:31 +02:00
4682b2e917 Small simplification 2020-07-29 15:23:22 +02:00
060f5d3350 Simplify code, compress LZSA2 another 15% faster 2020-07-29 13:01:24 +02:00
33eec56b9b Bump version 2020-07-27 15:36:01 +02:00
90fa770458 Compress another 8% faster 2020-07-27 13:25:16 +02:00
b2971da2b4 Nicer code 2020-07-26 16:38:22 +02:00
3fb9dc54b1 Compress LZSA2 another 3% faster 2020-07-26 10:07:03 +02:00
00d1d95625 Small improvement 2020-07-24 19:18:46 +02:00
e4f013f2db Compress LZSA2 17% faster 2020-07-24 17:14:01 +02:00
703ff19a3a Bump version 2020-07-14 23:50:44 +02:00
fc5081fb1a Rename confusing definitions 2020-07-14 22:36:38 +02:00
61698b5036 Another LZSA2 compression speedup 2020-07-14 17:01:07 +02:00
cf49af5cda Faster LZSA2 compression 2020-07-14 12:36:56 +02:00
c39158eea8 Compress LZSA2 faster, clean code up 2020-07-13 19:34:07 +02:00
4864f3c184 Compress LZSA1 a little faster 2020-07-10 17:45:13 +02:00
8ed768aafc Nicer and faster code 2020-07-10 08:55:45 +02:00
9c7495f458 Compress LZSA2 ~12% faster 2020-07-06 12:47:56 +02:00
afbb1de16c Merge pull request #48 from dougmasten/dev
More optimizations to the 6809 LZSA depackers
2020-06-28 15:46:17 +02:00
078edef880 Optimize match offset code in 6809 LZSA2 depacker 2020-06-27 04:17:05 -05:00
03692fca2c Update code byte counts for 6809 LZSA depackers 2020-06-27 02:02:33 -05:00
39e11422ec Delay clearing high part of literals count until we really have to in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
fde15d3fb0 Move instruction before branch to save one byte in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
fc8120f0da Optimize handling of 9 bits offset in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
c7b3ffc067 Delay clearing high part of literals count until we really have to in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
137c6201be One byte saving for setting reg A to $FF in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
e397428c1f Remove trailing whitespaces in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
b8cfbbbc7b Optimize handling of token's Z flag bit in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
28ca829924 delay clearing high part of literals count until we really have to in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
27562f4761 Restructure code to eliminate one BRA instruction from loop in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
0307d228a0 Merge pull request #3 from dougmasten/master
Merge pull request
2020-06-26 17:13:02 -05:00
e8b2ebb89f Merge pull request #2 from emmanuel-marty/master
Merge pull request
2020-06-26 17:08:44 -05:00
f72133f4cf Move instruction before branch to save one byte in 6809 LZSA2 depacker 2020-06-26 16:48:58 -05:00
56ba563794 One byte saving for setting reg A to $FF in 6809 LZSA2 depacker 2020-06-26 15:59:24 -05:00
c0f09db364 Delay clearing high part of literals count until we really have to in 6809 LZSA2 depacker 2020-06-26 15:56:28 -05:00
99db30a732 Optimize handling of 9 bits offset in 6809 LZSA2 depacker 2020-06-26 15:18:53 -05:00
061ca99838 Optimize handling of token's Z flag bit in 6809 LZSA2 depacker 2020-06-26 14:52:09 -05:00
7b96368469 Optimize match offset code in 6809 LZSA2 depacker 2020-06-26 14:42:36 -05:00
e9540b2e3d Remove unnecessary "ADDB #$18" as register B will always have this value from 6809 LZSA2 depacker 2020-06-24 00:49:09 -05:00
6a47ed7f41 Remove unnecessary "ADDB #$18" as B register will always have this value from 6809 LZSA depacker 2020-06-21 12:32:54 -05:00
06d63de9d7 Rearrange "CLRA" instruction for slight speed optimization in 6809 LZSA depacker 2020-06-21 12:16:07 -05:00
b5b8ca556a Rearrange match offset code to save 2 bytes in 6809 LZSA depacker 2020-06-21 01:09:15 -05:00
9 changed files with 611 additions and 438 deletions

View File

@ -1,4 +1,4 @@
; unlzsa1.s - 6809 decompression routine for raw LZSA1 - 111 bytes
; unlzsa1.s - 6809 decompression routine for raw LZSA1 - 110 bytes
; compress with lzsa -r <original_file> <compressed_file>
;
; in: x = start of compressed data
@ -25,8 +25,7 @@
decompress_lzsa1 equ lz1token
lz1bigof ldb ,x+ ; O set: load long 16 bit (negative, signed) offset
lda ,x+ ; (little endian)
lz1bigof lda ,x+ ; O set: load MSB 16-bit (negative, signed) offset
lz1gotof leau d,y ; put backreference start address in U (dst+offset)
puls b ; restore token
@ -65,7 +64,6 @@ lz1cpymt lda ,u+ ; copy matched byte
lz1token ldb ,x+ ; load next token into B: O|LLL|MMMM
pshs b ; save it
clra ; clear A (high part of literals count)
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
@ -73,7 +71,7 @@ lz1token ldb ,x+ ; load next token into B: O|LLL|MMMM
ldb ,x+ ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotlt ; if no overflow, we got the complete count, copy
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldb ,x+ ; load low 8 bits of little-endian literals count
@ -88,6 +86,7 @@ lz1declt lsrb ; shift literals count into place
lsrb
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt tfr x,u
tfr d,x ; transfer 16-bit count into X
@ -97,9 +96,9 @@ lz1cpylt lda ,u+ ; copy literal byte
bne lz1cpylt ; loop until all literal bytes are copied
tfr u,x
lz1nolt ldb ,s ; get token again, don't pop it from the stack
lz1nolt ldb ,x+ ; load either 8-bit or LSB 16-bit offset (negative, signed)
lda ,s ; get token again, don't pop it from the stack
bmi lz1bigof ; test O bit (small or large offset)
ldb ,x+ ; O clear: load 8 bit (negative, signed) offset
lda #$ff ; set high 8 bits
bra lz1gotof

View File

@ -1,4 +1,4 @@
; unlzsa1b.s - 6809 backward decompression routine for raw LZSA1 - 112 bytes
; unlzsa1b.s - 6809 backward decompression routine for raw LZSA1 - 113 bytes
; compress with lzsa -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
@ -26,11 +26,47 @@
decompress_lzsa1
leax 1,x
leay 1,y
bra lz1token
lz1bigof ldd ,--x ; O set: load long 16 bit (negative, signed) offset
lz1gotof nega ; reverse sign of offset in D
negb
sbca #0
leau d,y ; put backreference start address in U (dst+offset)
puls b ; restore token
clra ; clear A (high part of match length)
andb #$0F ; isolate MMMM (embedded match length)
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,-x ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldd ,--x ; load 16-bit len in D (low part in B, high in A)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,-x ; grab low 8 bits of len in B
lz1gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz1cpymt lda ,-u ; copy matched byte
sta ,-y
leax -1,x ; decrement X
bne lz1cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
lz1token ldb ,-x ; load next token into B: O|LLL|MMMM
pshs b ; save it
clra ; clear A (high part of literals count)
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
@ -38,7 +74,7 @@ lz1token ldb ,-x ; load next token into B: O|LLL|MMMM
ldb ,-x ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotlt ; if no overflow, we got the complete count, copy
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldd ,--x ; load 16 bit count in D (low part in B, high in A)
@ -52,11 +88,12 @@ lz1declt lsrb ; shift literals count into place
lsrb
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt tfr x,u
tfr d,x ; transfer 16-bit count into X
lz1cpylt lda ,-u ; copy literal byte
sta ,-y
sta ,-y
leax -1,x ; decrement X and update Z flag
bne lz1cpylt ; loop until all literal bytes are copied
tfr u,x
@ -67,40 +104,3 @@ lz1nolt ldb ,s ; get token again, don't pop it from the stack
ldb ,-x ; O clear: load 8 bit (negative, signed) offset
lda #$ff ; set high 8 bits
bra lz1gotof
lz1bigof ldd ,--x ; O set: load long 16 bit (negative, signed) offset
lz1gotof nega ; reverse sign of offset in D
negb
sbca #0
leau d,y ; put backreference start address in U (dst+offset)
puls b ; restore token
clra ; clear A (high part of match length)
andb #$0F ; isolate MMMM (embedded match length)
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,-x ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldd ,--x ; load 16-bit len in D (low part in B, high in A)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,-x ; grab low 8 bits of len in B
lz1gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz1cpymt lda ,-u ; copy matched byte
sta ,-y
leax -1,x ; decrement X
bne lz1cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
bra lz1token ; go decode next token

View File

@ -1,4 +1,4 @@
; unlzsa2.s - 6809 decompression routine for raw LZSA2 - 183 bytes
; unlzsa2.s - 6809 decompression routine for raw LZSA2 - 172 bytes
; compress with lzsa -f2 -r <original_file> <compressed_file>
;
; in: x = start of compressed data
@ -29,8 +29,7 @@ decompress_lzsa2
lz2token ldb ,x+ ; load next token into B: XYZ|LL|MMM
pshs b ; save it
clra ; clear A (high part of literals count)
andb #$18 ; isolate LLL (embedded literals count) in B
andb #$18 ; isolate LL (embedded literals count) in B
beq lz2nolt ; skip if no literals
cmpb #$18 ; LITERALS_RUN_LEN_V2?
bne lz2declt ; if not, we have the complete count, go unshift
@ -38,10 +37,10 @@ lz2token ldb ,x+ ; load next token into B: XYZ|LL|MMM
bsr lz2nibl ; get extra literals length nibble in B
addb #$03 ; add LITERALS_RUN_LEN_V2
cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
bne lz2gotlt ; if not, we have the full literals count, go copy
bne lz2gotla ; if not, we have the full literals count, go copy
addb ,x+ ; add extra literals count byte + LITERALS_RUN_LEN + 15
bcc lz2gotlt ; if no overflow, we got the complete count, copy
bcc lz2gotla ; if no overflow, we got the complete count, copy
ldb ,x+ ; load low 8 bits of little-endian literals count
lda ,x+ ; load high 8 bits of literal count
@ -50,11 +49,12 @@ lz2token ldb ,x+ ; load next token into B: XYZ|LL|MMM
lz2declt lsrb ; shift literals count into place
lsrb
lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt tfr x,u
tfr d,x ; transfer 16-bit count into X
lz2cpylt lda ,u+ ; copy literal byte
sta ,y+
sta ,y+
leax -1,x ; decrement X and update Z flag
bne lz2cpylt ; loop until all literal bytes are copied
tfr u,x
@ -65,23 +65,18 @@ lz2nolt ldb ,s ; get token again, don't pop it from the stack
bcs lz2replg ; if token's X bit is set, rep or large offset
lslb ; push token's Y flag bit into carry
sex ; push token's Z flag bit into reg A (carry flag is not affected)
bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
lslb ; push token's Z flag bit into carry
tfr cc,a ; preserve cpu flags (to preserve carry)
bsr lz2nibl ; get offset nibble in B
tfr a,cc ; restore cpu flags
lsla ; retrieve token's Z flag bit and push into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
lda #$ff ; set bits 8-15 of offset
sex ; set bits 8-15 of offset to $FF
bra lz2gotof
lz2offs9 clra ; clear A (to prepare for high 8 bits of offset)
lslb ; push token's Z flag bit into carry
rola ; shift Z flag from carry into bit 0 of A
coma ; set bits 9-15 of offset, reverse bit 8
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
ldb ,x+ ; load low 8 bits of (negative, signed) offset
bra lz2gotof
@ -106,30 +101,24 @@ lz2done rts
lz2replg lslb ; push token's Y flag bit into carry
bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
lslb ; push token's Z flag bit into carry
tfr cc,a ; preserve cpu flags (to preserve carry)
sex ; push token's Z flag bit into reg A
bsr lz2nibl ; get offset nibble in B
tfr a,cc ; restore cpu flags
lsla ; push token's Z flag bit into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
tfr b,a ; copy bits 8-15 of offset into A
suba #$02 ; subtract 512 from offset
ldb ,x+ ; load low 8 bits of (negative, signed) offset
bra lz2gotof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match
ldd ,x++ ; load high then low 8 bits of offset
lz2gotof std <lz2repof+1,pcr ; store match offset
lz2gotof std <lz2repof+2,pcr ; store match offset
lz2repof leau $aaaa,y ; put backreference start address in U (dst+offset)
lz2repof ldd #$aaaa ; load match offset
leau d,y ; put backreference start address in U (dst+offset)
puls b ; restore token
clra ; clear A (high part of match length)
andb #$07 ; isolate MMM (embedded match length)
addb #$02 ; add MIN_MATCH_SIZE_V2
@ -141,8 +130,7 @@ lz2repof ldd #$aaaa ; load match offset
cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
bne lz2gotln ; if not, we have the full match length, go copy
ldb ,x+ ; load extra length byte
addb #$18 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
addb ,x+ ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
bcc lz2gotln ; if no overflow, we have the full length
beq lz2done ; detect EOD code
@ -153,10 +141,9 @@ lz2gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz2cpymt lda ,u+ ; copy matched byte
sta ,y+
sta ,y+
leax -1,x ; decrement X
bne lz2cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
lbra lz2token ; go decode next token

View File

@ -1,4 +1,4 @@
; unlzsa2b.s - 6809 backward decompression routine for raw LZSA2 - 187 bytes
; unlzsa2b.s - 6809 backward decompression routine for raw LZSA2 - 174 bytes
; compress with lzsa -f2 -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
@ -31,7 +31,6 @@ decompress_lzsa2
lz2token ldb ,-x ; load next token into B: XYZ|LL|MMM
pshs b ; save it
clra ; clear A (high part of literals count)
andb #$18 ; isolate LLL (embedded literals count) in B
beq lz2nolt ; skip if no literals
cmpb #$18 ; LITERALS_RUN_LEN_V2?
@ -40,10 +39,10 @@ lz2token ldb ,-x ; load next token into B: XYZ|LL|MMM
bsr lz2nibl ; get extra literals length nibble in B
addb #$03 ; add LITERALS_RUN_LEN_V2
cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
bne lz2gotlt ; if not, we have the full literals count, go copy
bne lz2gotla ; if not, we have the full literals count, go copy
addb ,-x ; add extra literals count byte + LITERALS_RUN_LEN + 15
bcc lz2gotlt ; if no overflow, we got the complete count, copy
bcc lz2gotla ; if no overflow, we got the complete count, copy
ldd ,--x ; load 16 bit count in D (low part in B, high in A)
bra lz2gotlt ; we now have the complete count, go copy
@ -51,11 +50,12 @@ lz2token ldb ,-x ; load next token into B: XYZ|LL|MMM
lz2declt lsrb ; shift literals count into place
lsrb
lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt tfr x,u
tfr d,x ; transfer 16-bit count into X
lz2cpylt lda ,-u ; copy literal byte
sta ,-y
sta ,-y
leax -1,x ; decrement X and update Z flag
bne lz2cpylt ; loop until all literal bytes are copied
tfr u,x
@ -66,22 +66,18 @@ lz2nolt ldb ,s ; get token again, don't pop it from the stack
bcs lz2replg ; if token's X bit is set, rep or large offset
lslb ; push token's Y flag bit into carry
sex ; push token's Z flag bit into reg A (carry flag is not affected)
bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
lslb ; push token's Z flag bit into carry
tfr cc,a ; preserve cpu flags (to preserve carry)
bsr lz2nibl ; get offset nibble in B
tfr a,cc ; restore cpu flags
lsla ; retrieve token's Z flag bit and push into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
lda #$ff ; set bits 8-15 of offset
sex ; set bits 8-15 of offset to $FF
bra lz2gotof
lz2offs9 clra ; clear A (to prepare for high 8 bits of offset)
lslb ; push token's Z flag bit into carry
rola ; shift Z flag from carry into bit 0 of A
coma ; set bits 9-15 of offset, reverse bit 8
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
bra lz2lowof
lz2nibct fcb $00 ; nibble ready flag
@ -105,10 +101,9 @@ lz2done rts
lz2replg lslb ; push token's Y flag bit into carry
bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
lslb ; push token's Z flag bit into carry
tfr cc,a ; preserve cpu flags (to preserve carry)
sex ; push token's Z flag bit into reg A
bsr lz2nibl ; get offset nibble in B
tfr a,cc ; restore cpu flags
lsla ; retrieve token's Z flag bit and push into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
@ -117,20 +112,19 @@ lz2replg lslb ; push token's Y flag bit into carry
bra lz2lowof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match
lda ,-x ; load high 8 bits of (negative, signed) offset
lz2lowof ldb ,-x ; load low 8 bits of offset
lz2gotof nega ; reverse sign of offset in D
negb
sbca #0
std <lz2repof+1,pcr ; store match offset
std <lz2repof+2,pcr ; store match offset
lz2repof leau $aaaa,y ; put backreference start address in U (dst+offset)
lz2repof ldd #$aaaa ; load match offset
leau d,y ; put backreference start address in U (dst+offset)
puls b ; restore token
clra ; clear A (high part of match length)
andb #$07 ; isolate MMM (embedded match length)
addb #$02 ; add MIN_MATCH_SIZE_V2
@ -142,8 +136,7 @@ lz2repof ldd #$aaaa ; load match offset
cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
bne lz2gotln ; if not, we have the full match length, go copy
ldb ,-x ; load extra length byte
addb #$18 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
addb ,-x ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
bcc lz2gotln ; if no overflow, we have the full length
beq lz2done ; detect EOD code
@ -153,10 +146,9 @@ lz2gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz2cpymt lda ,-u ; copy matched byte
sta ,-y
sta ,-y
leax -1,x ; decrement X
bne lz2cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
lbra lz2token ; go decode next token

View File

@ -48,7 +48,7 @@
#define OPT_RAW_BACKWARD 8
#define OPT_STATS 16
#define TOOL_VERSION "1.3.0"
#define TOOL_VERSION "1.3.4"
/*---------------------------------------------------------------------------*/

View File

@ -157,66 +157,69 @@ static inline int lzsa_get_offset_cost_v1(const unsigned int nMatchOffset) {
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
*/
static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce) {
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << ARRIVALS_PER_POSITION_SHIFT);
const int nMinMatchSize = pCompressor->min_match_size;
const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
const int nModeSwitchPenalty = nFavorRatio ? 0 : MODESWITCH_PENALTY;
const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
int i, j, n;
if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
memset(arrival + (nStartOffset << ARRIVALS_PER_POSITION_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << ARRIVALS_PER_POSITION_SHIFT));
arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
arrival[nStartOffset << ARRIVALS_PER_POSITION_SHIFT].from_slot = -1;
for (i = nStartOffset; i != nEndOffset; i++) {
lzsa_arrival* cur_arrival = &arrival[i << ARRIVALS_PER_POSITION_SHIFT];
int m;
for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
for (j = 0; j < NARRIVALS_PER_POSITION_V1 && cur_arrival[j].from_slot; j++) {
int nPrevCost = cur_arrival[j].cost;
int nCodingChoiceCost = nPrevCost + 8 /* literal */;
int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;
int nScore = cur_arrival[j].score + 1;
int nNumLiterals = cur_arrival[j].num_literals + 1;
if (nNumLiterals == LITERALS_RUN_LEN_V1 || nNumLiterals == 256 || nNumLiterals == 512) {
nCodingChoiceCost += 8;
}
if (!nFavorRatio && nNumLiterals == 1)
nCodingChoiceCost += MODESWITCH_PENALTY;
if (nNumLiterals == 1)
nCodingChoiceCost += nModeSwitchPenalty;
for (n = 0; n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n];
lzsa_arrival *pDestSlots = &arrival[(i + 1) << ARRIVALS_PER_POSITION_SHIFT];
for (n = 0; n < NARRIVALS_PER_POSITION_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &pDestSlots[n];
if (pDestArrival->from_slot == 0 ||
nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
memmove(&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
memmove(&arrival[((i + 1) << ARRIVALS_PER_POSITION_SHIFT) + n + 1],
&arrival[((i + 1) << ARRIVALS_PER_POSITION_SHIFT) + n],
sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - n - 1));
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->from_pos = i;
pDestArrival->from_slot = j + 1;
pDestArrival->match_offset = 0;
pDestArrival->match_len = 0;
pDestArrival->num_literals = nNumLiterals;
pDestArrival->score = nScore;
pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
pDestArrival->rep_offset = cur_arrival[j].rep_offset;
break;
}
}
}
const lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V1);
int nNumArrivalsForThisPos = j;
for (m = 0; m < NMATCHES_PER_INDEX_V1 && match[m].length; m++) {
int nMatchLen = match[m].length;
int nMatchOffsetCost = lzsa_get_offset_cost_v1(match[m].offset);
int nStartingMatchLen, k;
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;
if ((i + nMatchLen) > nEndOffset)
nMatchLen = nEndOffset - i;
if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
nStartingMatchLen = nMatchLen;
@ -225,43 +228,48 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
int nMatchLenCost = lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
lzsa_arrival *pDestSlots = &arrival[(i + k) << ARRIVALS_PER_POSITION_SHIFT];
for (j = 0; j < nNumArrivalsForThisPos; j++) {
int nPrevCost = cur_arrival[j].cost;
int nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 5;
int exists = 0;
if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
nCodingChoiceCost += MODESWITCH_PENALTY;
if (!cur_arrival[j].num_literals)
nCodingChoiceCost += nModeSwitchPenalty;
for (n = 0;
n < NMATCHES_PER_ARRIVAL_V1 && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].cost <= nCodingChoiceCost;
n < NARRIVALS_PER_POSITION_V1 && pDestSlots[n].from_slot && pDestSlots[n].cost <= nCodingChoiceCost;
n++) {
if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
if (lzsa_get_offset_cost_v1(pDestSlots[n].rep_offset) == nMatchOffsetCost) {
exists = 1;
break;
}
}
for (n = 0; !exists && n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n];
if (!exists) {
int nScore = cur_arrival[j].score + 5;
if (pDestArrival->from_slot == 0 ||
nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
memmove(&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
for (n = 0; n < NARRIVALS_PER_POSITION_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &pDestSlots[n];
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->from_pos = i;
pDestArrival->from_slot = j + 1;
pDestArrival->match_offset = match[m].offset;
pDestArrival->match_len = k;
pDestArrival->num_literals = 0;
pDestArrival->score = nScore;
pDestArrival->rep_offset = match[m].offset;
break;
if (pDestArrival->from_slot == 0 ||
nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
memmove(&pDestSlots[n + 1],
&pDestSlots[n],
sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - n - 1));
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->from_pos = i;
pDestArrival->from_slot = j + 1;
pDestArrival->match_len = k;
pDestArrival->num_literals = 0;
pDestArrival->score = nScore;
pDestArrival->rep_offset = match[m].offset;
j = NARRIVALS_PER_POSITION_V1;
break;
}
}
}
}
@ -269,14 +277,17 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
}
}
lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];
lzsa_arrival *end_arrival = &arrival[(i << ARRIVALS_PER_POSITION_SHIFT) + 0];
while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
if (end_arrival->from_pos >= nEndOffset) return;
pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
if (end_arrival->match_len)
pBestMatch[end_arrival->from_pos].offset = end_arrival->rep_offset;
else
pBestMatch[end_arrival->from_pos].offset = 0;
end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
end_arrival = &arrival[(end_arrival->from_pos << ARRIVALS_PER_POSITION_SHIFT) + (end_arrival->from_slot - 1)];
}
}
@ -301,12 +312,12 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const un
lzsa_match *pMatch = pBestMatch + i;
if (pMatch->length == 0 &&
(i + 1) < (nEndOffset - LAST_LITERALS) &&
(i + 1) < nEndOffset &&
pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V1 &&
pBestMatch[i + 1].length < MAX_VARLEN &&
pBestMatch[i + 1].offset &&
i >= pBestMatch[i + 1].offset &&
(i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
(i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
!memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
int nCurLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V1);
int nReducedLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V1);

File diff suppressed because it is too large Load Diff

View File

@ -62,6 +62,9 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
pCompressor->best_match = NULL;
pCompressor->improved_match = NULL;
pCompressor->arrival = NULL;
pCompressor->rep_handled_mask = NULL;
pCompressor->first_offset_for_byte = NULL;
pCompressor->next_offset_for_pos = NULL;
pCompressor->min_match_size = nMinMatchSize;
if (pCompressor->min_match_size < nMinMatchSizeForFormat)
pCompressor->min_match_size = nMinMatchSizeForFormat;
@ -89,7 +92,7 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));
if (pCompressor->open_intervals) {
pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << MATCHES_PER_ARRIVAL_SHIFT) * sizeof(lzsa_arrival));
pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << ARRIVALS_PER_POSITION_SHIFT) * sizeof(lzsa_arrival));
if (pCompressor->arrival) {
pCompressor->best_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));
@ -102,8 +105,23 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V2 * sizeof(lzsa_match));
else
pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V1 * sizeof(lzsa_match));
if (pCompressor->match)
return 0;
if (pCompressor->match) {
if (pCompressor->format_version == 2) {
pCompressor->rep_handled_mask = (char*)malloc(NARRIVALS_PER_POSITION_V2_BIG * ((LCP_MAX + 1) / 8) * sizeof(char));
if (pCompressor->rep_handled_mask) {
pCompressor->first_offset_for_byte = (int*)malloc(65536 * sizeof(int));
if (pCompressor->first_offset_for_byte) {
pCompressor->next_offset_for_pos = (int*)malloc(BLOCK_SIZE * sizeof(int));
if (pCompressor->next_offset_for_pos) {
return 0;
}
}
}
}
else {
return 0;
}
}
}
}
}
@ -124,6 +142,21 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
void lzsa_compressor_destroy(lzsa_compressor *pCompressor) {
divsufsort_destroy(&pCompressor->divsufsort_context);
if (pCompressor->next_offset_for_pos) {
free(pCompressor->next_offset_for_pos);
pCompressor->next_offset_for_pos = NULL;
}
if (pCompressor->first_offset_for_byte) {
free(pCompressor->first_offset_for_byte);
pCompressor->first_offset_for_byte = NULL;
}
if (pCompressor->rep_handled_mask) {
free(pCompressor->rep_handled_mask);
pCompressor->rep_handled_mask = NULL;
}
if (pCompressor->match) {
free(pCompressor->match);
pCompressor->match = NULL;

View File

@ -49,10 +49,10 @@ extern "C" {
#define VISITED_FLAG 0x80000000
#define EXCL_VISITED_MASK 0x7fffffff
#define NMATCHES_PER_ARRIVAL_V1 8
#define NMATCHES_PER_ARRIVAL_V2_SMALL 9
#define NMATCHES_PER_ARRIVAL_V2_BIG 32
#define MATCHES_PER_ARRIVAL_SHIFT 5
#define NARRIVALS_PER_POSITION_V1 8
#define NARRIVALS_PER_POSITION_V2_SMALL 9
#define NARRIVALS_PER_POSITION_V2_BIG 32
#define ARRIVALS_PER_POSITION_SHIFT 5
#define NMATCHES_PER_INDEX_V1 8
#define MATCHES_PER_INDEX_SHIFT_V1 3
@ -63,8 +63,6 @@ extern "C" {
#define LEAVE_ALONE_MATCH_SIZE 300
#define LEAVE_ALONE_MATCH_SIZE_SMALL 1000
#define LAST_LITERALS 0
#define MODESWITCH_PENALTY 3
/** One match */
@ -81,12 +79,10 @@ typedef struct {
int from_pos;
unsigned short rep_len;
unsigned short match_len;
int rep_pos;
int num_literals;
int score;
unsigned short match_offset;
unsigned short match_len;
} lzsa_arrival;
/** Compression statistics */
@ -128,6 +124,9 @@ typedef struct _lzsa_compressor {
lzsa_match *best_match;
lzsa_match *improved_match;
lzsa_arrival *arrival;
char *rep_handled_mask;
int *first_offset_for_byte;
int *next_offset_for_pos;
int min_match_size;
int format_version;
int flags;