Merge pull request #5 from emmanuel-marty/master

Catch up with the changes in main
2024-11-26 02:49:19 +00:00 · 2020-01-02 13:51:29 +00:00 · 2020-01-02 13:51:29 +00:00 · 3b37a0bb70
commit 3b37a0bb70
parent d5d788946e 8721c11041
16 changed files with 2709 additions and 1050 deletions
--- a/README.md
+++ b/README.md
@ -5,6 +5,8 @@ LZSA is a collection of byte-aligned compression formats that are specifically e

 Check out [The Hollow](https://www.pouet.net/prod.php?which=81909) by Darklite and Offense, winner of the Solskogen 2019 wild compo, that uses LZSA on Z80.

+[Gabba](https://www.pouet.net/prod.php?which=83539) by Stardust ranked 2nd in the ZX Spectrum demo compo at CAFe demoparty 2019 and also used LZSA on Z80. 
+
 The LZSA compression tool uses an aggressive optimal packing strategy to try to find the sequence of commands that gives the smallest packed file that decompresses to the original while maintaining the maximum possible decompression speed.

 The compression formats give the user choices that range from decompressing faster than LZ4 on 8-bit systems with better compression, to compressing as well as ZX7 with much better decompression speed. LZSA1 is designed to replace LZ4 and LZSA2 to replace ZX7, in 8-bit scenarios.
--- a/asm/6502/decompress_fast_v1.asm
+++ b/asm/6502/decompress_fast_v1.asm
@ -0,0 +1,305 @@
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
+; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+DECOMPRESS_LZSA1_FAST                   ; entry point: decode tokens until DECOMPRESSION_DONE returns
+   LDY #$00                             ; Y = high byte of copy counts; both copy loops exit with Y back at 0
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: O|LLL|MMMM
+   PHA                                  ; preserve token on stack
+
+   AND #$70                             ; isolate literals count
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   CMP #$70                             ; LITERALS_RUN_LEN?
+   BNE PREPARE_COPY_LITERALS            ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$F9                             ; (LITERALS_RUN_LEN) A = extra byte - 249, i.e. extra byte + 7 modulo 256
+   BCC PREPARE_COPY_LITERALS_DIRECT     ; borrow: extra byte was below 249, A holds the complete 8-bit count
+   BEQ LARGE_VARLEN_LITERALS            ; if adding up to zero, go grab 16-bit count
+
+   JSR GETSRC                           ; get single extended byte of variable literals count
+   INY                                  ; add 256 to literals count
+   BCS PREPARE_COPY_LITERALS_DIRECT     ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
+
+LARGE_VARLEN_LITERALS                   ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   TXA                                  ; reload low 8 bits so the Z flag reflects them for the BEQ below
+   JMP PREPARE_COPY_LARGE_LITERALS      ; X and Y are already set up, skip the TAX
+
+PREPARE_COPY_LITERALS
+   TAX                                  ; X = token & $70 ($10..$60 on this path), used as table index
+   LDA SHIFT_TABLE-1,X                  ; shift literals length into place
+                                        ; -1 because position 00 is reserved
+PREPARE_COPY_LITERALS_DIRECT
+   TAX                                  ; X = low 8 bits of literals count, Z set when they are zero
+
+PREPARE_COPY_LARGE_LITERALS
+   BEQ COPY_LITERALS                    ; low byte zero: X wraps through 256 iterations per Y step
+   INY                                  ; otherwise count the partial first pass as one more outer step
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX                                  ; inner counter (low byte)
+   BNE COPY_LITERALS
+   DEY                                  ; outer counter (high byte)
+   BNE COPY_LITERALS                    ; falls through with X = 0 and Y = 0
+   
+NO_LITERALS                             ; literals done: pick the match offset size from token bit 7
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   BMI GET_LONG_OFFSET                  ; $80: 16 bit offset
+
+   JSR GETSRC                           ; get 8 bit offset from stream in A
+   TAX                                  ; save for later
+   LDA #$FF                             ; high 8 bits
+   BNE GOT_OFFSET                       ; go prepare match
+                                        ; (*like JMP GOT_OFFSET but shorter)
+
+SHORT_VARLEN_MATCHLEN
+   JSR GETSRC                           ; get single extended byte of variable match len
+   INY                                  ; add 256 to match length
+
+PREPARE_COPY_MATCH
+   TAX                                  ; X = low 8 bits of match length
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; set the Z flag from the low 8 bits of the length
+   BEQ COPY_MATCH_LOOP                  ; low byte zero: X wraps through 256 iterations per Y step
+   INY                                  ; otherwise count the partial first pass as one more outer step
+
+COPY_MATCH_LOOP
+   LDA $AAAA                            ; get one byte of backreference
+   JSR PUTDST                           ; copy to destination
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   LDA COPY_MATCH_LOOP+1                ; low byte of the patched LDA operand
+   BEQ GETMATCH_ADJ_HI                  ; zero: the decrement must also borrow from the high byte
+GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+1                ; step the backreference address down by one
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1                ; step the low byte of the patched LDA operand up by one
+   BEQ GETMATCH_ADJ_HI                  ; wrapped to zero: carry into the high byte
+GETMATCH_DONE
+
+}
+
+   DEX                                  ; inner counter (low byte of match length)
+   BNE COPY_MATCH_LOOP
+   DEY                                  ; outer counter (high byte of match length)
+   BNE COPY_MATCH_LOOP
+   BEQ DECODE_TOKEN                     ; (*like JMP DECODE_TOKEN but shorter)
+
+!ifdef BACKWARD_DECOMPRESS {
+
+GETMATCH_ADJ_HI
+   DEC COPY_MATCH_LOOP+2                ; borrow from the high byte of the backreference address
+   JMP GETMATCH_DONE                    ; then decrement the low byte ($00 -> $FF)
+
+} else {
+
+GETMATCH_ADJ_HI
+   INC COPY_MATCH_LOOP+2                ; carry into the high byte of the backreference address
+   JMP GETMATCH_DONE
+
+}
+
+GET_LONG_OFFSET                         ; handle 16 bit offset:
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+
+GOT_OFFSET
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   STX OFFSLO                           ; store low 8 bits of offset
+
+   SEC                                  ; subtract match offset from dest
+   LDA PUTDST+1                         ; low byte of the current destination address
+OFFSLO = *+1
+   SBC #$AA                             ; low 8 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   LDA PUTDST+2                         ; high byte of the current destination address
+OFFSHI = *+1
+   SBC #$AA                             ; high 8 bits
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   SEC                                  ; force carry set: the ADC #$02 below counts on it
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   TXA                                  ; low 8 bits of offset
+
+   CLC                                  ; add dest + match offset
+   ADC PUTDST+1                         ; low 8 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+OFFSHI = *+1
+   LDA #$AA                             ; high 8 bits
+
+   ADC PUTDST+2                         ; plus high byte of dest and the carry from the low byte
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+}
+
+   PLA                                  ; retrieve token from stack again
+   AND #$0F                             ; isolate match len (MMMM)
+   ADC #$02                             ; plus carry which is always set by the high ADC (by the SEC in the backward build)
+   CMP #$12                             ; MATCH_RUN_LEN?
+   BCC PREPARE_COPY_MATCH               ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
+   BCC PREPARE_COPY_MATCH               ; borrow: extra byte was below 238, A holds the complete 8-bit length
+   BNE SHORT_VARLEN_MATCHLEN            ; extra byte above 238: one more byte follows, length = 256 + that byte
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+                                        ; large match length with zero high byte?
+   BNE PREPARE_COPY_MATCH_Y             ; if not, continue
+
+DECOMPRESSION_DONE
+   RTS                                  ; token was already pulled above, so the stack is balanced here
+
+SHIFT_TABLE                             ; lookup for "A >> 4": the byte at SHIFT_TABLE-1+X equals X >> 4
+   !BYTE     $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00  ; X = $01..$0F (15 entries, X = $00 has no slot)
+   !BYTE $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01  ; X = $10..$1F
+   !BYTE $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02  ; X = $20..$2F
+   !BYTE $03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03  ; X = $30..$3F
+   !BYTE $04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04  ; X = $40..$4F
+   !BYTE $05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05  ; X = $50..$5F
+   !BYTE $06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06  ; X = $60..$6F
+   !BYTE $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07  ; X = $70..$7F
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT                                  ; copy one byte: fetch from source, then fall through to store it
+   JSR GETSRC
+PUTDST                                  ; store A at the destination address, then move that address down by one
+LZSA_DST_LO = *+1                       ; destination address lives in the operand bytes of the STA below
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   LDA PUTDST+1                         ; low byte zero? then the decrement borrows from the high byte
+   BEQ PUTDST_ADJ_HI
+   DEC PUTDST+1
+   RTS
+
+PUTDST_ADJ_HI
+   DEC PUTDST+2                         ; borrow from the high byte
+   DEC PUTDST+1                         ; low byte $00 -> $FF
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A and move the source address down by one
+LZSA_SRC_LO = *+1                       ; source address lives in the operand bytes of the LDA below
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   PHA                                  ; park the fetched byte while A is used to test the low address byte
+   LDA GETSRC+1                         ; low byte zero? then the decrement borrows from the high byte
+   BEQ GETSRC_ADJ_HI
+   DEC GETSRC+1
+   PLA                                  ; recover the fetched byte
+   RTS
+
+GETSRC_ADJ_HI
+   DEC GETSRC+2                         ; borrow from the high byte
+   DEC GETSRC+1                         ; low byte $00 -> $FF
+   PLA                                  ; recover the fetched byte
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT                                  ; copy one byte: fetch from source, then fall through to store it
+   JSR GETSRC
+PUTDST                                  ; store A at the destination address, then move that address up by one
+LZSA_DST_LO = *+1                       ; destination address lives in the operand bytes of the STA below
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   INC PUTDST+1                         ; advance low byte
+   BEQ PUTDST_ADJ_HI                    ; wrapped to zero: carry into the high byte
+   RTS
+
+PUTDST_ADJ_HI
+   INC PUTDST+2
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A and move the source address up by one
+LZSA_SRC_LO = *+1                       ; source address lives in the operand bytes of the LDA below
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   INC GETSRC+1                         ; advance low byte
+   BEQ GETSRC_ADJ_HI                    ; wrapped to zero: carry into the high byte
+   RTS
+
+GETSRC_ADJ_HI
+   INC GETSRC+2
+   RTS
+}
--- a/asm/6502/decompress_fast_v2.asm
+++ b/asm/6502/decompress_fast_v2.asm
@ -0,0 +1,363 @@
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA2 block.
+; Create one with lzsa -r -f2 <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
+; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+NIBCOUNT = $FC                          ; zero-page nibble flag: 1 while the low nibble held in NIBBLES is still unread
+
+DECOMPRESS_LZSA2_FAST                   ; entry point: decode tokens until DECOMPRESSION_DONE returns
+   LDY #$00                             ; Y = high byte of copy counts; both copy loops exit with Y back at 0
+   STY NIBCOUNT                         ; no buffered nibble yet
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: XYZ|LL|MMM
+   PHA                                  ; preserve token on stack
+
+   AND #$18                             ; isolate literals count (LL)
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   CMP #$18                             ; LITERALS_RUN_LEN_V2?
+   BCC PREPARE_COPY_LITERALS            ; if less, count is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra literals length nibble
+                                        ; add nibble to len from token
+   ADC #$02                             ; (LITERALS_RUN_LEN_V2) minus carry
+   CMP #$12                             ; LITERALS_RUN_LEN_V2 + 15 ?
+   BCC PREPARE_COPY_LITERALS_DIRECT     ; if less, literals count is complete
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; overflow?
+   JMP PREPARE_COPY_LITERALS_DIRECT     ; carry clear = 8-bit count in A, carry set = 16-bit count follows
+
+PREPARE_COPY_LITERALS_LARGE
+                                        ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   BCS PREPARE_COPY_LITERALS_HIGH       ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
+
+PREPARE_COPY_LITERALS
+   LSR                                  ; shift literals count into place
+   LSR
+   LSR                                  ; A = LL (1 or 2 on this path), carry ends up clear
+
+PREPARE_COPY_LITERALS_DIRECT
+   TAX                                  ; X = low 8 bits of literals count
+   BCS PREPARE_COPY_LITERALS_LARGE      ; if so, literals count is large
+
+PREPARE_COPY_LITERALS_HIGH
+   TXA                                  ; set the Z flag from the low 8 bits of the count
+   BEQ COPY_LITERALS                    ; low byte zero: X wraps through 256 iterations per Y step
+   INY                                  ; otherwise count the partial first pass as one more outer step
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX                                  ; inner counter (low byte)
+   BNE COPY_LITERALS
+   DEY                                  ; outer counter (high byte)
+   BNE COPY_LITERALS                    ; falls through with X = 0 and Y = 0
+   
+NO_LITERALS                             ; literals done: decode the match offset from token bits XYZ
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   ASL                                  ; X bit -> carry
+   BCS REPMATCH_OR_LARGE_OFFSET         ; 1YZ: rep-match or 13/16 bit offset
+
+   ASL                                  ; 0YZ: 5 or 9 bit offset
+   BCS OFFSET_9_BIT                     ; Y bit set: 9 bit offset
+    
+                                        ; 00Z: 5 bit offset
+
+   LDX #$FF                             ; set offset bits 15-8 to 1
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 0, read nibble for bits 4-1
+   ORA #$E0                             ; set bits 7-5 to 1
+   BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
+   
+OFFSET_9_BIT                            ; 01Z: 9 bit offset
+   ;;ASL                                  ; shift Z (offset bit 8) in place
+   ROL                                  ; Z bit (currently bit 7) -> carry
+   ROL                                  ; carry (Z bit) -> bit 0
+   AND #$01                             ; keep only the Z bit
+   EOR #$FF                             ; set offset bits 15-9 to 1
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_LARGE_OFFSET
+   ASL                                  ; 13 bit offset?
+   BCS REPMATCH_OR_16_BIT               ; handle rep-match or 16-bit offset if not
+
+                                        ; 10Z: 13 bit offset
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 8, read nibble for bits 12-9
+   ADC #$DE                             ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
+   ;;ASL                                  ; XYZ=111?
+   BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
+   
+                                        ; 110: handle 16 bit offset
+   JSR GETSRC                           ; grab high 8 bits
+GOT_OFFSET_HI
+   TAX                                  ; keep high byte of offset in X
+   JSR GETSRC                           ; grab low 8 bits
+GOT_OFFSET_LO
+   STA OFFSLO                           ; store low byte of match offset
+   STX OFFSHI                           ; store high byte of match offset
+
+REP_MATCH
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   SEC                                  ; subtract match offset from dest
+   LDA PUTDST+1                         ; low 8 bits
+OFFSLO = *+1
+   SBC #$AA                             ; operand is the stored low byte of the match offset
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   LDA PUTDST+2                         ; high byte of the current destination address
+OFFSHI = *+1
+   SBC #$AA                             ; high 8 bits
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   SEC                                  ; force carry set: the ADC #$01 below counts on it
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   CLC                                  ; add dest + match offset
+   LDA PUTDST+1                         ; low 8 bits
+OFFSLO = *+1
+   ADC #$AA                             ; operand is the stored low byte of the match offset
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+OFFSHI = *+1
+   LDA #$AA                             ; high 8 bits
+   ADC PUTDST+2                         ; plus high byte of dest and the carry from the low byte
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+}
+   
+   PLA                                  ; retrieve token from stack again
+   AND #$07                             ; isolate match len (MMM)
+   ADC #$01                             ; add MIN_MATCH_SIZE_V2 and carry
+   CMP #$09                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+   BCC PREPARE_COPY_MATCH               ; if less, length is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra match length nibble
+                                        ; add nibble to len from token
+   ADC #$08                             ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
+   CMP #$18                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+   BCC PREPARE_COPY_MATCH               ; if less, match length is complete
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$E8                             ; overflow?
+
+PREPARE_COPY_MATCH
+   TAX                                  ; X = low 8 bits of match length, Z set when they are zero
+   BCC PREPARE_COPY_MATCH_Y             ; if not, the match length is complete
+   BEQ DECOMPRESSION_DONE               ; if EOD code, bail
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; set the Z flag from the low 8 bits of the length
+   BEQ COPY_MATCH_LOOP                  ; low byte zero: X wraps through 256 iterations per Y step
+   INY                                  ; otherwise count the partial first pass as one more outer step
+
+COPY_MATCH_LOOP
+   LDA $AAAA                            ; get one byte of backreference
+   JSR PUTDST                           ; copy to destination
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   LDA COPY_MATCH_LOOP+1                ; low byte of the patched LDA operand
+   BEQ GETMATCH_ADJ_HI                  ; zero: the decrement must also borrow from the high byte
+GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+1                ; step the backreference address down by one
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1                ; step the low byte of the patched LDA operand up by one
+   BEQ GETMATCH_ADJ_HI                  ; wrapped to zero: carry into the high byte
+GETMATCH_DONE
+
+}
+
+   DEX                                  ; inner counter (low byte of match length)
+   BNE COPY_MATCH_LOOP
+   DEY                                  ; outer counter (high byte of match length)
+   BNE COPY_MATCH_LOOP
+   JMP DECODE_TOKEN                     ; match copied, go read the next token
+
+!ifdef BACKWARD_DECOMPRESS {
+
+GETMATCH_ADJ_HI
+   DEC COPY_MATCH_LOOP+2                ; borrow from the high byte of the backreference address
+   JMP GETMATCH_DONE                    ; then decrement the low byte ($00 -> $FF)
+
+} else {
+
+GETMATCH_ADJ_HI
+   INC COPY_MATCH_LOOP+2                ; carry into the high byte of the backreference address
+   JMP GETMATCH_DONE
+
+}
+
+GETCOMBINEDBITS                         ; in: A bit 7 = token Z bit; out: A = (nibble << 1) | inverted Z, carry clear
+   EOR #$80                             ; invert the Z bit
+   ASL                                  ; inverted Z bit -> carry
+   PHP                                  ; keep that carry across the GETNIBBLE call
+
+   JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
+   PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
+COMBINEDBITZ
+   ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
+DECOMPRESSION_DONE
+   RTS                                  ; shared RTS: also the exit taken on the end-of-data code
+
+GETNIBBLE                               ; return the next 4-bit value in A (bits 0-3); carry is set on return
+NIBBLES = *+1                           ; last nibble pair read is kept in the operand of the LDA below
+   LDA #$AA                             ; $AA is a placeholder, overwritten by STA NIBBLES
+   LSR NIBCOUNT                         ; flag -> carry (and flag cleared): is a low nibble still pending?
+   BCC NEED_NIBBLES                     ; no: fetch a fresh byte
+   AND #$0F                             ; isolate low 4 bits of nibble
+   RTS
+
+NEED_NIBBLES
+   INC NIBCOUNT                         ; mark the low nibble of the new byte as pending
+   JSR GETSRC                           ; get 2 nibbles
+   STA NIBBLES                          ; keep the byte so the low nibble can be served next time
+   LSR                                  ; move the high nibble down into bits 0-3
+   LSR 
+   LSR 
+   LSR 
+   SEC                                  ; return with carry set, same as the buffered-nibble path
+   RTS
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT                                  ; copy one byte: fetch from source, then fall through to store it
+   JSR GETSRC
+PUTDST                                  ; store A at the destination address, then move that address down by one
+LZSA_DST_LO = *+1                       ; destination address lives in the operand bytes of the STA below
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   LDA PUTDST+1                         ; low byte zero? then the decrement borrows from the high byte
+   BEQ PUTDST_ADJ_HI
+   DEC PUTDST+1
+   RTS
+
+PUTDST_ADJ_HI
+   DEC PUTDST+2                         ; borrow from the high byte
+   DEC PUTDST+1                         ; low byte $00 -> $FF
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A and move the source address down by one
+LZSA_SRC_LO = *+1                       ; source address lives in the operand bytes of the LDA below
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   PHA                                  ; park the fetched byte while A is used to test the low address byte
+   LDA GETSRC+1                         ; low byte zero? then the decrement borrows from the high byte
+   BEQ GETSRC_ADJ_HI
+   DEC GETSRC+1
+   PLA                                  ; recover the fetched byte
+   RTS
+
+GETSRC_ADJ_HI
+   DEC GETSRC+2                         ; borrow from the high byte
+   DEC GETSRC+1                         ; low byte $00 -> $FF
+   PLA                                  ; recover the fetched byte
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT                                  ; copy one byte: fetch from source, then fall through to store it
+   JSR GETSRC
+PUTDST                                  ; store A at the destination address, then move that address up by one
+LZSA_DST_LO = *+1                       ; destination address lives in the operand bytes of the STA below
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   INC PUTDST+1                         ; advance low byte
+   BEQ PUTDST_ADJ_HI                    ; wrapped to zero: carry into the high byte
+   RTS
+
+PUTDST_ADJ_HI
+   INC PUTDST+2
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A and move the source address up by one
+LZSA_SRC_LO = *+1                       ; source address lives in the operand bytes of the LDA below
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; $AAAA is a placeholder operand, rewritten at run time
+   INC GETSRC+1                         ; advance low byte
+   BEQ GETSRC_ADJ_HI                    ; wrapped to zero: carry into the high byte
+   RTS
+
+GETSRC_ADJ_HI
+   INC GETSRC+2
+   RTS
+}
+
--- a/asm/6502/decompress_faster_v2.asm
+++ b/asm/6502/decompress_faster_v2.asm
@ -0,0 +1,470 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; lzsa2_6502.s
+;
+; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
+;
+; Optional code is presented for two minor 6502 optimizations that break
+; compatibility with the current LZSA2 format standard.
+;
+; This code is written for the ACME assembler.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+;  http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Options & Macros
+;
+
+                ;
+                ; Save 7 bytes of code, and 21 cycles every time that a 
+                ; 16-bit length is decoded?
+                ;
+                ; N.B. Setting this breaks compatibility with LZSA v1.2
+                ;
+
+LZSA_SWAP_LEN16 =       0                       ; 0 = option off (as shipped).
+
+                ;
+                ; Save 3 bytes of code, and 4 or 8 cycles when decoding
+                ; an offset?
+                ;
+                ; N.B. Setting this breaks compatibility with LZSA v1.2
+                ;
+
+LZSA_SWAP_XZY   =       0                       ; 0 = standard XYZ bit order (as shipped).
+
+                ;
+                ; Remove code inlining to save space?
+                ;
+                ; This saves 15 bytes of code, but decompression is 7% slower.
+                ;
+
+LZSA_BEST_SIZE  =       0                       ; 0 = keep the inlined byte fetch (as shipped).
+
+                ;
+                ; Assume that we're decompressing from a large multi-bank
+                ; compressed data file, and that the next bank may need to
+                ; be paged in when a page-boundary is crossed.
+                ;
+
+LZSA_FROM_BANK  =       0                       ; 0 = plain linear source data (as shipped).
+
+                ;
+                ; Macro to increment the source pointer to the next page.
+                ;
+
+                !if     LZSA_FROM_BANK {
+
+                   !macro LZSA_INC_PAGE {
+                      jsr     .next_page              ; Banked source: let a subroutine handle the page change.
+                   }
+
+                } else {
+
+                   !macro LZSA_INC_PAGE {
+                      inc     <lzsa_srcptr + 1        ; Linear source: just bump the pointer's hi-byte.
+                   }
+
+                }
+
+                ;
+                ; Macro to read a byte from the compressed source data.
+                ;
+
+                !if     LZSA_BEST_SIZE {
+
+                   !macro LZSA_GET_SRC {
+                      jsr     .get_byte               ; Small build: share one byte-fetch subroutine.
+                   }
+
+                } else {
+
+                   !macro LZSA_GET_SRC {
+                      lda     (lzsa_srcptr),y         ; Fast build: fetch inline (callers keep Y at 0 here - TODO confirm for all call sites).
+                      inc     <lzsa_srcptr + 0        ; Advance the lo-byte of the source pointer.
+                      bne     .skip                   ; No wrap, so the page is unchanged.
+                      +LZSA_INC_PAGE
+.skip:
+                   }
+
+                }
+
+                ;
+                ; Macro to speed up reading 50% of nibbles.
+                ;
+
+LZSA_SLOW_NIBL  =       1                       ; 1 = always call the nibble subroutine (as shipped).
+
+                !if     (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
+
+                   !macro LZSA_GET_NIBL {
+                      jsr     lzsa2_get_nibble        ; Always call a function.
+                   }
+
+                } else {
+
+                   !macro LZSA_GET_NIBL {
+                      lsr     <lzsa_nibflg            ; Is there a nibble waiting?
+                      lda     <lzsa_nibble            ; Extract the lo-nibble.
+                      bcs     .skip\@                 ; NOTE(review): "\@" looks like ca65 syntax, LZSA_GET_SRC uses plain ".skip" - confirm this assembles with ACME.
+                      jsr     .new_nibble             ; Extract the hi-nibble.
+      .skip\@:        ora     #$F0
+                   }
+
+                }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 11 bytes of zero-page.
+;
+
+lzsa_cmdbuf     =       $F5                     ; 1 byte: copy of the current command byte.
+lzsa_nibflg     =       $F6                     ; 1 byte: flag, is a nibble waiting?
+lzsa_nibble     =       $F7                     ; 1 byte: buffered nibble data.
+lzsa_offset     =       $F8                     ; 1 word.
+lzsa_winptr     =       $FA                     ; 1 word.
+lzsa_srcptr     =       $FC                     ; 1 word: ptr to compressed data.
+lzsa_dstptr     =       $FE                     ; 1 word: ptr to output buffer.
+
+LZSA_SRC_LO     =       $FC                     ; Same address as lzsa_srcptr + 0.
+LZSA_SRC_HI     =       $FD                     ; Same address as lzsa_srcptr + 1.
+LZSA_DST_LO     =       $FE                     ; Same address as lzsa_dstptr + 0.
+LZSA_DST_HI     =       $FF                     ; Same address as lzsa_dstptr + 1.
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format.
+;
+; Args: lzsa_srcptr = ptr to compressed data
+; Args: lzsa_dstptr = ptr to output buffer
+; Uses: lots!
+;
+; If compiled with LZSA_FROM_BANK, then lzsa_srcptr should be within the bank
+; window range.
+;
+
+DECOMPRESS_LZSA2_FAST:
+lzsa2_unpack:   ldy     #0                      ; Initialize source index.
+                sty     <lzsa_nibflg            ; Initialize nibble buffer.
+
+                ;
+                ; Copy bytes from compressed source data.
+                ;
+
+.cp_length:     ldx     #$00                    ; Hi-byte of length or offset.
+
+                +LZSA_GET_SRC
+                sta     <lzsa_cmdbuf            ; Preserve this for later.
+                and     #$18                    ; Extract literal length.
+                beq     .lz_offset              ; Skip directly to match?
+
+                lsr                             ; Get 2-bit literal length.
+                lsr
+                lsr
+                cmp     #$03                    ; Extended length?
+                bne     .got_cp_len
+
+                jsr     .get_length             ; X=0 table index for literals.
+
+.got_cp_len:    tay                             ; Check the lo-byte of length.
+                beq     .cp_page
+
+                inx                             ; Increment # of pages to copy.
+
+.get_cp_src:    clc                             ; Calc source for partial
+                adc     <lzsa_srcptr + 0        ; page.
+                sta     <lzsa_srcptr + 0
+                bcs     .get_cp_dst
+                dec     <lzsa_srcptr + 1
+
+.get_cp_dst:    tya
+                clc                             ; Calc destination for partial
+                adc     <lzsa_dstptr + 0        ; page.
+                sta     <lzsa_dstptr + 0
+                bcs     .get_cp_idx
+                dec     <lzsa_dstptr + 1
+
+.get_cp_idx:    tya                             ; Negate the lo-byte of length.
+                eor     #$FF
+                tay
+                iny
+
+.cp_page:       lda     (lzsa_srcptr),y
+                sta     (lzsa_dstptr),y
+                iny
+                bne     .cp_page
+                inc     <lzsa_srcptr + 1
+                inc     <lzsa_dstptr + 1
+                dex                             ; Any full pages left to copy?
+                bne     .cp_page
+
+                !if      LZSA_SWAP_XZY {
+
+                ;
+                ; Shorter and faster path with NEW order of bits.
+                ;
+                ; STD  NEW
+                ; ================================
+                ; xyz  xzy
+                ; 00z  0z0  5-bit offset
+                ; 01z  0z1  9-bit offset
+                ; 10z  1z0  13-bit offset
+                ; 110  101  16-bit offset
+                ; 111  111  repeat offset
+                ;      NVZ  for a BIT instruction
+                ;
+                ; N.B. Saves 3 bytes in code length.
+                ;      get5 and get13 are 8 cycles faster.
+                ;      get9, get16, and rep are 4 cycles faster.
+                ;
+
+.lz_offset:     lda     #$20                    ; Y bit in lzsa_cmdbuf.
+                bit     <lzsa_cmdbuf            ; N=bit7, V=bit6, Z=(bit5==0).
+                bmi     .get_13_16_rep
+                bne     .get_9_bits
+
+.get_5_bits:    dex                             ; X=$FF
+.get_13_bits:   +LZSA_GET_NIBL                  ; Always returns with CS.
+                bvc     .get_5_skip             ; Keep CS when V is clear,
+                clc                             ; else shift in a zero bit.
+.get_5_skip:    rol                             ; Shift into position, set C.
+                cpx     #$00                    ; X=$FF for a 5-bit offset.
+                bne     .set_offset
+                sbc     #2                      ; Subtract 512 because 13-bit
+                tax                             ; offset starts at $FE00.
+                bne     .get_low8               ; Always NZ from previous TAX.
+
+.get_9_bits:    dex                             ; X=$FF if VC, X=$FE if VS.
+                bvc     .get_low8
+                dex
+                bvs     .get_low8               ; Always VS from previous BIT.
+
+.get_13_16_rep: beq     .get_13_bits            ; Shares code with 5-bit path.
+
+.get_16_rep:    bvs     .lz_length              ; Repeat previous offset.
+
+                } else {
+
+                ;
+                ; Slower and longer path with STD order of bits.
+                ;
+                ; STD  NEW
+                ; ================================
+                ; xyz  xzy
+                ; 00z  0z0  5-bit offset
+                ; 01z  0z1  9-bit offset
+                ; 10z  1z0  13-bit offset
+                ; 110  101  16-bit offset
+                ; 111  111  repeat offset
+                ;      NVZ  for a BIT instruction (NEW order only)
+                ;
+
+.lz_offset:     lda     <lzsa_cmdbuf
+                asl                             ; C = X bit.
+                bcs     .get_13_16_rep
+                asl                             ; C = Y bit.
+                bcs     .get_9_bits
+
+.get_5_bits:    dex                             ; X=$FF
+.get_13_bits:   asl                             ; C = Z bit.
+                php                             ; Keep Z bit across the fetch.
+                +LZSA_GET_NIBL                  ; Its carry is replaced by PLP.
+                plp
+                rol                             ; Shift into position, set C.
+                eor     #$01                    ; Invert the Z bit in bit 0.
+                cpx     #$00                    ; X=$FF for a 5-bit offset.
+                bne     .set_offset
+                sbc     #2                      ; Subtract 512 because 13-bit
+                tax                             ; offset starts at $FE00.
+                bne     .get_low8               ; Always NZ from previous TAX.
+
+.get_9_bits:    dex                             ; X=$FF if CC, X=$FE if CS.
+                asl                             ; C = Z bit.
+                bcc     .get_low8
+                dex
+                bcs     .get_low8               ; Always CS from previous ASL.
+
+.get_13_16_rep: asl                             ; C = Y bit, N = Z bit.
+                bcc     .get_13_bits            ; Shares code with 5-bit path.
+
+.get_16_rep:    bmi     .lz_length              ; Repeat previous offset.
+
+                }
+
+                ;
+                ; Read the rest of the match offset and save it.
+                ;
+                ; N.B. X=0 is expected and guaranteed when we get here.
+                ;
+
+.get_16_bits:   jsr     .get_byte               ; Get hi-byte of offset.
+                tax                             ; X = hi-byte of offset.
+
+.get_low8:      +LZSA_GET_SRC                   ; Get lo-byte of offset.
+
+.set_offset:    stx     <lzsa_offset + 1        ; Save new offset.
+                sta     <lzsa_offset + 0        ; (X = hi-byte, A = lo-byte.)
+
+.lz_length:     ldx     #$00                    ; Hi-byte of length.
+
+                lda     <lzsa_cmdbuf            ; Low 3 bits of the command
+                and     #$07                    ; give the short match length.
+                clc
+                adc     #$02                    ; Direct lengths are 2..8.
+                cmp     #$09                    ; Extended length?
+                bne     .got_lz_len
+
+                inx
+                jsr     .get_length             ; X=1 table index for match.
+
+.got_lz_len:    eor     #$FF                    ; Negate the lo-byte of length
+                tay                             ; and check for zero.
+                iny
+                beq     .calc_lz_addr           ; Lo-byte 0: full pages only.
+                eor     #$FF                    ; Restore the lo-byte in A.
+
+                inx                             ; Increment # of pages to copy.
+
+                clc                             ; Calc destination for partial
+                adc     <lzsa_dstptr + 0        ; page.
+                sta     <lzsa_dstptr + 0
+                bcs     .calc_lz_addr           ; Net: dstptr += lo-byte - 256,
+                dec     <lzsa_dstptr + 1        ; so dstptr+Y = old dstptr.
+
+.calc_lz_addr:  clc                             ; Calc address of match.
+                lda     <lzsa_dstptr + 0        ; N.B. Offset is negative!
+                adc     <lzsa_offset + 0
+                sta     <lzsa_winptr + 0
+                lda     <lzsa_dstptr + 1
+                adc     <lzsa_offset + 1
+                sta     <lzsa_winptr + 1
+
+.lz_page:       lda     (lzsa_winptr),y         ; Copy one byte from window
+                sta     (lzsa_dstptr),y         ; to destination.
+                iny
+                bne     .lz_page                ; Loop until Y wraps to zero.
+                inc     <lzsa_winptr + 1        ; Advance both pointers to
+                inc     <lzsa_dstptr + 1        ; the next 256-byte page.
+                dex                             ; Any full pages left to copy?
+                bne     .lz_page
+
+                jmp     .cp_length              ; Loop around to the beginning.
+
+                ;
+                ; Lookup tables to differentiate literal and match lengths.
+                ; Indexed by X: 0 = literal, 1 = match.
+
+.nibl_len_tbl:  !byte   3 + $10                 ; 0+3 (for literal).
+                !byte   9 + $10                 ; 2+7 (for match).
+
+.byte_len_tbl:  !byte   18 - 1                  ; 0+3+15 - CS (for literal).
+                !byte   24 - 1                  ; 2+7+15 - CS (for match).
+
+                ;
+                ; Get 16-bit length in X:A register pair.
+                ; On entry X is the table index (0 = literal, 1 = match).
+                ; N.B. Requires reversal of bytes in 16-bit length.
+                ;
+
+.get_length:    +LZSA_GET_NIBL
+                cmp     #$FF                    ; Extended length?
+                bcs     .byte_length
+                adc     .nibl_len_tbl,x         ; Always CC from previous CMP.
+
+.got_length:    ldx     #$00                    ; Set hi-byte of 4 & 8 bit
+                rts                             ; lengths.
+
+.byte_length:   jsr     .get_byte               ; So rare, this can be slow!
+                adc     .byte_len_tbl,x         ; Always CS from previous CMP.
+                bcc     .got_length             ; No wrap: 8-bit length in A.
+                beq     .finished               ; Wrapped to exactly 0: done.
+
+                !if      LZSA_SWAP_LEN16 {
+
+.word_length:   jsr     .get_byte               ; So rare, this can be slow!
+                tax                             ; Hi-byte first; falls through
+                                                ; to .get_byte for the lo-byte.
+                } else {
+
+.word_length:   jsr     .get_byte               ; So rare, this can be slow!
+                pha                             ; Save the lo-byte.
+                jsr     .get_byte               ; So rare, this can be slow!
+                tax                             ; X = hi-byte.
+                pla                             ; A = lo-byte.
+                rts
+
+                }
+
+.get_byte:      lda     (lzsa_srcptr),y         ; Subroutine version for when
+                inc     <lzsa_srcptr + 0        ; inlining isn't advantageous.
+                beq     .next_page              ; Lo-byte wrapped: bump page.
+                rts                             ; A = byte read, C unchanged.
+
+.next_page:     inc     <lzsa_srcptr + 1        ; Inc & test for bank overflow.
+                !if      LZSA_FROM_BANK {
+                bmi     .next_bank              ; Change for target hardware!
+                }
+                rts
+
+.finished:      pla                             ; Decompression completed, pop
+                pla                             ; return address.
+                rts                             ; Return to the outer caller.
+
+                ;
+                ; Get a nibble value from compressed data in A.
+                ;
+
+                !if      (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
+
+lzsa2_get_nibble:    lsr     <lzsa_nibflg            ; Is there a nibble waiting?
+                lda     <lzsa_nibble            ; Extract the lo-nibble.
+                bcs     .got_nibble
+
+                inc     <lzsa_nibflg            ; Reset the flag.
+                +LZSA_GET_SRC
+                sta     <lzsa_nibble            ; Preserve for next time.
+                lsr                             ; Extract the hi-nibble.
+                lsr
+                lsr
+                lsr
+
+                !if     LZSA_SWAP_XZY {
+                sec                             ; Offset code relies on CS.
+                }
+
+.got_nibble:    ora     #$F0                    ; Force the top 4 bits to 1.
+                rts
+
+                } else {
+
+.new_nibble:    inc     <lzsa_nibflg            ; Reset the flag.
+                +LZSA_GET_SRC
+                sta     <lzsa_nibble            ; Preserve for next time.
+                lsr                             ; Extract the hi-nibble.
+                lsr
+                lsr
+                lsr
+
+                !if     LZSA_SWAP_XZY {
+                sec                             ; Offset code relies on CS.
+                }
+
+                rts
+
+                }
--- a/asm/6502/decompress_small_v1.asm
+++ b/asm/6502/decompress_small_v1.asm
@ -1,270 +1,270 @@
-; -----------------------------------------------------------------------------
-; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
-;
-; in:
-; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
-; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
-;
-; out:
-; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
-;
-; -----------------------------------------------------------------------------
-; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
-; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
-;
-; in:
-; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
-; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
-;
-; out:
-; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
-;
-; -----------------------------------------------------------------------------
-;
-;  Copyright (C) 2019 Emmanuel Marty
-;
-;  This software is provided 'as-is', without any express or implied
-;  warranty.  In no event will the authors be held liable for any damages
-;  arising from the use of this software.
-;
-;  Permission is granted to anyone to use this software for any purpose,
-;  including commercial applications, and to alter it and redistribute it
-;  freely, subject to the following restrictions:
-;
-;  1. The origin of this software must not be misrepresented; you must not
-;     claim that you wrote the original software. If you use this software
-;     in a product, an acknowledgment in the product documentation would be
-;     appreciated but is not required.
-;  2. Altered source versions must be plainly marked as such, and must not be
-;     misrepresented as being the original software.
-;  3. This notice may not be removed or altered from any source distribution.
-; -----------------------------------------------------------------------------
-
-DECOMPRESS_LZSA1
-   LDY #$00
-
-DECODE_TOKEN
-   JSR GETSRC                           ; read token byte: O|LLL|MMMM
-   PHA                                  ; preserve token on stack
-
-   AND #$70                             ; isolate literals count
-   BEQ NO_LITERALS                      ; skip if no literals to copy
-   LSR                                  ; shift literals count into place
-   LSR
-   LSR
-   LSR
-   CMP #$07                             ; LITERALS_RUN_LEN?
-   BCC PREPARE_COPY_LITERALS            ; if not, count is directly embedded in token
-
-   JSR GETSRC                           ; get extra byte of variable literals count
-                                        ; the carry is always set by the CMP above
-                                        ; GETSRC doesn't change it
-   SBC #$F9                             ; (LITERALS_RUN_LEN)
-   BCC PREPARE_COPY_LITERALS
-   BEQ LARGE_VARLEN_LITERALS            ; if adding up to zero, go grab 16-bit count
-
-   JSR GETSRC                           ; get single extended byte of variable literals count
-   INY                                  ; add 256 to literals count
-   BCS PREPARE_COPY_LITERALS            ; (*like JMP PREPARE_COPY_LITERALS but shorter)
-
-LARGE_VARLEN_LITERALS                   ; handle 16 bits literals count
-                                        ; literals count = directly these 16 bits
-   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
-   TAY                                  ; put high 8 bits in Y
-   TXA
-
-PREPARE_COPY_LITERALS
-   TAX
-   BEQ COPY_LITERALS
-   INY
-
-COPY_LITERALS
-   JSR GETPUT                           ; copy one byte of literals
-   DEX
-   BNE COPY_LITERALS
-   DEY
-   BNE COPY_LITERALS
-   
-NO_LITERALS
-   PLA                                  ; retrieve token from stack
-   PHA                                  ; preserve token again
-   BMI GET_LONG_OFFSET                  ; $80: 16 bit offset
-
-   JSR GETSRC                           ; get 8 bit offset from stream in A
-   TAX                                  ; save for later
-   LDA #$FF                             ; high 8 bits
-   BNE GOT_OFFSET                       ; go prepare match
-                                        ; (*like JMP GOT_OFFSET but shorter)
-
-SHORT_VARLEN_MATCHLEN
-   JSR GETSRC                           ; get single extended byte of variable match len
-   INY                                  ; add 256 to match length
-
-PREPARE_COPY_MATCH
-   TAX
-PREPARE_COPY_MATCH_Y
-   TXA
-   BEQ COPY_MATCH_LOOP
-   INY
-
-COPY_MATCH_LOOP
-   LDA $AAAA                            ; get one byte of backreference
-   JSR PUTDST                           ; copy to destination
-
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression -- put backreference bytes backward
-
-   LDA COPY_MATCH_LOOP+1
-   BNE GETMATCH_DONE
-   DEC COPY_MATCH_LOOP+2
-GETMATCH_DONE
-   DEC COPY_MATCH_LOOP+1
-
-} else {
-
-   ; Forward decompression -- put backreference bytes forward
-
-   INC COPY_MATCH_LOOP+1
-   BNE GETMATCH_DONE
-   INC COPY_MATCH_LOOP+2
-GETMATCH_DONE
-
-}
-
-   DEX
-   BNE COPY_MATCH_LOOP
-   DEY
-   BNE COPY_MATCH_LOOP
-   BEQ DECODE_TOKEN                     ; (*like JMP DECODE_TOKEN but shorter)
-
-GET_LONG_OFFSET                         ; handle 16 bit offset:
-   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
-
-GOT_OFFSET
-
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression - substract match offset
-
-   STA OFFSHI                           ; store high 8 bits of offset
-   STX OFFSLO
-
-   SEC                                  ; substract dest - match offset
-   LDA PUTDST+1
-OFFSLO = *+1
-   SBC #$AA                             ; low 8 bits
-   STA COPY_MATCH_LOOP+1                ; store back reference address
-   LDA PUTDST+2
-OFFSHI = *+1
-   SBC #$AA                             ; high 8 bits
-   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
-   SEC
-
-} else {
-
-   ; Forward decompression - add match offset
-
-   STA OFFSHI                           ; store high 8 bits of offset
-   TXA
-
-   CLC                                  ; add dest + match offset
-   ADC PUTDST+1                         ; low 8 bits
-   STA COPY_MATCH_LOOP+1                ; store back reference address
-OFFSHI = *+1
-   LDA #$AA                             ; high 8 bits
-
-   ADC PUTDST+2
-   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
-   
-}
-
-   PLA                                  ; retrieve token from stack again
-   AND #$0F                             ; isolate match len (MMMM)
-   ADC #$02                             ; plus carry which is always set by the high ADC
-   CMP #$12                             ; MATCH_RUN_LEN?
-   BCC PREPARE_COPY_MATCH               ; if not, count is directly embedded in token
-
-   JSR GETSRC                           ; get extra byte of variable match length
-                                        ; the carry is always set by the CMP above
-                                        ; GETSRC doesn't change it
-   SBC #$EE                             ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
-   BCC PREPARE_COPY_MATCH
-   BNE SHORT_VARLEN_MATCHLEN
-
-                                        ; Handle 16 bits match length
-   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
-   TAY                                  ; put high 8 bits in Y
-                                        ; large match length with zero high byte?
-   BNE PREPARE_COPY_MATCH_Y             ; if not, continue
-
-DECOMPRESSION_DONE
-   RTS
-
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression -- get and put bytes backward
-
-GETPUT
-   JSR GETSRC
-PUTDST
-LZSA_DST_LO = *+1
-LZSA_DST_HI = *+2
-   STA $AAAA
-   LDA PUTDST+1
-   BNE PUTDST_DONE
-   DEC PUTDST+2
-PUTDST_DONE
-   DEC PUTDST+1
-   RTS
-
-GETLARGESRC
-   JSR GETSRC                           ; grab low 8 bits
-   TAX                                  ; move to X
-                                        ; fall through grab high 8 bits
-
-GETSRC
-LZSA_SRC_LO = *+1
-LZSA_SRC_HI = *+2
-   LDA $AAAA
-   PHA
-   LDA GETSRC+1
-   BNE GETSRC_DONE
-   DEC GETSRC+2
-GETSRC_DONE
-   DEC GETSRC+1
-   PLA
-   RTS
-
-} else {
-
-   ; Forward decompression -- get and put bytes forward
-
-GETPUT
-   JSR GETSRC
-PUTDST
-LZSA_DST_LO = *+1
-LZSA_DST_HI = *+2
-   STA $AAAA
-   INC PUTDST+1
-   BNE PUTDST_DONE
-   INC PUTDST+2
-PUTDST_DONE
-   RTS
-
-GETLARGESRC
-   JSR GETSRC                           ; grab low 8 bits
-   TAX                                  ; move to X
-                                        ; fall through grab high 8 bits
-
-GETSRC
-LZSA_SRC_LO = *+1
-LZSA_SRC_HI = *+2
-   LDA $AAAA
-   INC GETSRC+1
-   BNE GETSRC_DONE
-   INC GETSRC+2
-GETSRC_DONE
-   RTS
-
-}
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
+; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+DECOMPRESS_LZSA1                        ; entry point
+   LDY #$00                             ; Y = high byte of the running count
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: O|LLL|MMMM
+   PHA                                  ; preserve token on stack
+
+   AND #$70                             ; isolate literals count
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   LSR                                  ; shift literals count into place
+   LSR
+   LSR
+   LSR
+   CMP #$07                             ; LITERALS_RUN_LEN?
+   BCC PREPARE_COPY_LITERALS            ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$F9                             ; A = byte - $F9, i.e. byte + 7 modulo 256
+   BCC PREPARE_COPY_LITERALS            ; byte < $F9: count = 7 + byte, fits in 8 bits
+   BEQ LARGE_VARLEN_LITERALS            ; if adding up to zero, go grab 16-bit count
+
+   JSR GETSRC                           ; get single extended byte of variable literals count
+   INY                                  ; add 256 to literals count
+   BCS PREPARE_COPY_LITERALS            ; (*like JMP PREPARE_COPY_LITERALS but shorter)
+
+LARGE_VARLEN_LITERALS                   ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   TXA                                  ; low 8 bits back in A
+
+PREPARE_COPY_LITERALS
+   TAX                                  ; X = low 8 bits of count
+   BEQ COPY_LITERALS                    ; low byte zero: Y already counts the 256-byte runs
+   INY                                  ; otherwise one extra pass for the partial run
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX
+   BNE COPY_LITERALS
+   DEY
+   BNE COPY_LITERALS                    ; falls out of the loop with X = 0 and Y = 0
+   
+NO_LITERALS
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   BMI GET_LONG_OFFSET                  ; $80: 16 bit offset
+
+   JSR GETSRC                           ; get 8 bit offset from stream in A
+   TAX                                  ; save for later
+   LDA #$FF                             ; high 8 bits
+   BNE GOT_OFFSET                       ; go prepare match
+                                        ; (*like JMP GOT_OFFSET but shorter)
+
+SHORT_VARLEN_MATCHLEN
+   JSR GETSRC                           ; get single extended byte of variable match len
+   INY                                  ; add 256 to match length
+
+PREPARE_COPY_MATCH
+   TAX                                  ; X = low 8 bits of match length
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; test low 8 bits of match length
+   BEQ COPY_MATCH_LOOP                  ; low byte zero: Y already counts the 256-byte runs
+   INY                                  ; otherwise one extra pass for the partial run
+
+COPY_MATCH_LOOP
+   LDA $AAAA                            ; get one byte of backreference (operand is patched)
+   JSR PUTDST                           ; copy to destination
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   LDA COPY_MATCH_LOOP+1                ; low byte zero means the decrement will borrow
+   BNE GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+2                ; so decrement the high byte first
+GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+1
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1                ; advance the patched LDA operand, low byte
+   BNE GETMATCH_DONE
+   INC COPY_MATCH_LOOP+2                ; carry into the high byte
+GETMATCH_DONE
+
+}
+
+   DEX
+   BNE COPY_MATCH_LOOP
+   DEY
+   BNE COPY_MATCH_LOOP
+   BEQ DECODE_TOKEN                     ; (*like JMP DECODE_TOKEN but shorter)
+
+GET_LONG_OFFSET                         ; handle 16 bit offset:
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+
+GOT_OFFSET
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   STX OFFSLO                           ; store low 8 bits of offset
+
+   SEC                                  ; subtract dest - match offset
+   LDA PUTDST+1
+OFFSLO = *+1
+   SBC #$AA                             ; low 8 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   LDA PUTDST+2
+OFFSHI = *+1
+   SBC #$AA                             ; high 8 bits
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   SEC                                  ; the ADC #$02 below expects carry set
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   TXA                                  ; low 8 bits of offset
+
+   CLC                                  ; add dest + match offset
+   ADC PUTDST+1                         ; low 8 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+OFFSHI = *+1
+   LDA #$AA                             ; high 8 bits
+
+   ADC PUTDST+2
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+}
+
+   PLA                                  ; retrieve token from stack again
+   AND #$0F                             ; isolate match len (MMMM)
+   ADC #$02                             ; plus carry which is always set by the high ADC
+   CMP #$12                             ; MATCH_RUN_LEN?
+   BCC PREPARE_COPY_MATCH               ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
+   BCC PREPARE_COPY_MATCH               ; no wrap: 8-bit match length in A
+   BNE SHORT_VARLEN_MATCHLEN            ; wrapped to nonzero: one more length byte follows
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+                                        ; large match length with zero high byte?
+   BNE PREPARE_COPY_MATCH_Y             ; if not, continue
+
+DECOMPRESSION_DONE
+   RTS
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT
+   JSR GETSRC                           ; read one source byte into A
+PUTDST
+LZSA_DST_LO = *+1
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; operand is the destination pointer (patched)
+   LDA PUTDST+1                         ; low byte zero means the decrement will borrow
+   BNE PUTDST_DONE
+   DEC PUTDST+2                         ; so decrement the high byte first
+PUTDST_DONE
+   DEC PUTDST+1
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC
+LZSA_SRC_LO = *+1
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; operand is the source pointer (patched)
+   PHA                                  ; keep the byte while A is used below
+   LDA GETSRC+1                         ; low byte zero means the decrement will borrow
+   BNE GETSRC_DONE
+   DEC GETSRC+2                         ; so decrement the high byte first
+GETSRC_DONE
+   DEC GETSRC+1
+   PLA                                  ; A = the byte that was read
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT
+   JSR GETSRC                           ; read one source byte into A
+PUTDST
+LZSA_DST_LO = *+1
+LZSA_DST_HI = *+2
+   STA $AAAA                            ; operand is the destination pointer (patched)
+   INC PUTDST+1                         ; advance the pointer, low byte
+   BNE PUTDST_DONE
+   INC PUTDST+2                         ; carry into the high byte
+PUTDST_DONE
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC
+LZSA_SRC_LO = *+1
+LZSA_SRC_HI = *+2
+   LDA $AAAA                            ; operand is the source pointer (patched)
+   INC GETSRC+1                         ; advance the pointer, low byte
+   BNE GETSRC_DONE
+   INC GETSRC+2                         ; carry into the high byte
+GETSRC_DONE
+   RTS
+
+}
--- a/asm/6502/decompress_small_v2.asm
+++ b/asm/6502/decompress_small_v2.asm
@ -1,336 +1,336 @@
-; -----------------------------------------------------------------------------
-; Decompress raw LZSA2 block.
-; Create one with lzsa -r -f2 <original_file> <compressed_file>
-;
-; in:
-; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
-; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
-;
-; out:
-; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
-;
-; -----------------------------------------------------------------------------
-; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
-; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
-;
-; in:
-; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
-; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
-;
-; out:
-; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
-;
-; -----------------------------------------------------------------------------
-;
-;  Copyright (C) 2019 Emmanuel Marty
-;
-;  This software is provided 'as-is', without any express or implied
-;  warranty.  In no event will the authors be held liable for any damages
-;  arising from the use of this software.
-;
-;  Permission is granted to anyone to use this software for any purpose,
-;  including commercial applications, and to alter it and redistribute it
-;  freely, subject to the following restrictions:
-;
-;  1. The origin of this software must not be misrepresented; you must not
-;     claim that you wrote the original software. If you use this software
-;     in a product, an acknowledgment in the product documentation would be
-;     appreciated but is not required.
-;  2. Altered source versions must be plainly marked as such, and must not be
-;     misrepresented as being the original software.
-;  3. This notice may not be removed or altered from any source distribution.
-; -----------------------------------------------------------------------------
-
-NIBCOUNT = $FC                          ; zero-page location for temp offset
-
-DECOMPRESS_LZSA2
-   LDY #$00
-   STY NIBCOUNT
-
-DECODE_TOKEN
-   JSR GETSRC                           ; read token byte: XYZ|LL|MMM
-   PHA                                  ; preserve token on stack
-
-   AND #$18                             ; isolate literals count (LL)
-   BEQ NO_LITERALS                      ; skip if no literals to copy
-   LSR                                  ; shift literals count into place
-   LSR
-   LSR
-   CMP #$03                             ; LITERALS_RUN_LEN_V2?
-   BCC PREPARE_COPY_LITERALS            ; if less, count is directly embedded in token
-
-   JSR GETNIBBLE                        ; get extra literals length nibble
-                                        ; add nibble to len from token
-   ADC #$02                             ; (LITERALS_RUN_LEN_V2) minus carry
-   CMP #$12                             ; LITERALS_RUN_LEN_V2 + 15 ?
-   BCC PREPARE_COPY_LITERALS            ; if less, literals count is complete
-
-   JSR GETSRC                           ; get extra byte of variable literals count
-                                        ; the carry is always set by the CMP above
-                                        ; GETSRC doesn't change it
-   SBC #$EE                             ; overflow?
-
-PREPARE_COPY_LITERALS
-   TAX
-   BCC PREPARE_COPY_LITERALS_HIGH       ; if not, literals count is complete
-
-                                        ; handle 16 bits literals count
-                                        ; literals count = directly these 16 bits
-   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
-   TAY                                  ; put high 8 bits in Y
-
-PREPARE_COPY_LITERALS_HIGH
-   TXA
-   BEQ COPY_LITERALS
-   INY
-
-COPY_LITERALS
-   JSR GETPUT                           ; copy one byte of literals
-   DEX
-   BNE COPY_LITERALS
-   DEY
-   BNE COPY_LITERALS
-   
-NO_LITERALS
-   PLA                                  ; retrieve token from stack
-   PHA                                  ; preserve token again
-   ASL
-   BCS REPMATCH_OR_LARGE_OFFSET         ; 1YZ: rep-match or 13/16 bit offset
-
-   ASL                                  ; 0YZ: 5 or 9 bit offset
-   BCS OFFSET_9_BIT         
-    
-                                        ; 00Z: 5 bit offset
-
-   LDX #$FF                             ; set offset bits 15-8 to 1
-
-   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 0, read nibble for bits 4-1
-   ORA #$E0                             ; set bits 7-5 to 1
-   BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
-   
-OFFSET_9_BIT                            ; 01Z: 9 bit offset
-   ;;ASL                                  ; shift Z (offset bit 8) in place
-   ROL
-   ROL
-   AND #$01
-   EOR #$FF                             ; set offset bits 15-9 to 1
-   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
-                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
-
-REPMATCH_OR_LARGE_OFFSET
-   ASL                                  ; 13 bit offset?
-   BCS REPMATCH_OR_16_BIT               ; handle rep-match or 16-bit offset if not
-
-                                        ; 10Z: 13 bit offset
-
-   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 8, read nibble for bits 12-9
-   ADC #$DE                             ; set bits 15-13 to 1 and substract 2 (to substract 512)
-   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
-                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
-
-REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
-   ;;ASL                                  ; XYZ=111?
-   BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
-   
-                                        ; 110: handle 16 bit offset
-   JSR GETSRC                           ; grab high 8 bits
-GOT_OFFSET_HI
-   TAX
-   JSR GETSRC                           ; grab low 8 bits
-GOT_OFFSET_LO
-   STA OFFSLO                           ; store low byte of match offset
-   STX OFFSHI                           ; store high byte of match offset
-
-REP_MATCH
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression - substract match offset
-
-   SEC                                  ; add dest + match offset
-   LDA PUTDST+1                         ; low 8 bits
-OFFSLO = *+1
-   SBC #$AA
-   STA COPY_MATCH_LOOP+1                ; store back reference address
-   LDA PUTDST+2
-OFFSHI = *+1
-   SBC #$AA                             ; high 8 bits
-   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
-   SEC
-
-} else {
-
-   ; Forward decompression - add match offset
-
-   CLC                                  ; add dest + match offset
-   LDA PUTDST+1                         ; low 8 bits
-OFFSLO = *+1
-   ADC #$AA
-   STA COPY_MATCH_LOOP+1                ; store back reference address
-OFFSHI = *+1
-   LDA #$AA                             ; high 8 bits
-   ADC PUTDST+2
-   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
-   
-}
-   
-   PLA                                  ; retrieve token from stack again
-   AND #$07                             ; isolate match len (MMM)
-   ADC #$01                             ; add MIN_MATCH_SIZE_V2 and carry
-   CMP #$09                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
-   BCC PREPARE_COPY_MATCH               ; if less, length is directly embedded in token
-
-   JSR GETNIBBLE                        ; get extra match length nibble
-                                        ; add nibble to len from token
-   ADC #$08                             ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
-   CMP #$18                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
-   BCC PREPARE_COPY_MATCH               ; if less, match length is complete
-
-   JSR GETSRC                           ; get extra byte of variable match length
-                                        ; the carry is always set by the CMP above
-                                        ; GETSRC doesn't change it
-   SBC #$E8                             ; overflow?
-
-PREPARE_COPY_MATCH
-   TAX
-   BCC PREPARE_COPY_MATCH_Y             ; if not, the match length is complete
-   BEQ DECOMPRESSION_DONE               ; if EOD code, bail
-
-                                        ; Handle 16 bits match length
-   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
-   TAY                                  ; put high 8 bits in Y
-
-PREPARE_COPY_MATCH_Y
-   TXA
-   BEQ COPY_MATCH_LOOP
-   INY
-
-COPY_MATCH_LOOP
-   LDA $AAAA                            ; get one byte of backreference
-   JSR PUTDST                           ; copy to destination
-
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression -- put backreference bytes backward
-
-   LDA COPY_MATCH_LOOP+1
-   BNE GETMATCH_DONE
-   DEC COPY_MATCH_LOOP+2
-GETMATCH_DONE
-   DEC COPY_MATCH_LOOP+1
-
-} else {
-
-   ; Forward decompression -- put backreference bytes forward
-
-   INC COPY_MATCH_LOOP+1
-   BNE GETMATCH_DONE
-   INC COPY_MATCH_LOOP+2
-GETMATCH_DONE
-
-}
-
-   DEX
-   BNE COPY_MATCH_LOOP
-   DEY
-   BNE COPY_MATCH_LOOP
-   JMP DECODE_TOKEN
-
-GETCOMBINEDBITS
-   EOR #$80
-   ASL
-   PHP
-
-   JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
-   PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
-COMBINEDBITZ
-   ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
-DECOMPRESSION_DONE
-   RTS
-
-GETNIBBLE
-NIBBLES = *+1
-   LDA #$AA
-   LSR NIBCOUNT
-   BCS HAS_NIBBLES
-
-   INC NIBCOUNT
-   JSR GETSRC                           ; get 2 nibbles
-   STA NIBBLES
-   LSR 
-   LSR 
-   LSR 
-   LSR 
-   SEC
-
-HAS_NIBBLES
-   AND #$0F                             ; isolate low 4 bits of nibble
-   RTS
-
-!ifdef BACKWARD_DECOMPRESS {
-
-   ; Backward decompression -- get and put bytes backward
-
-GETPUT
-   JSR GETSRC
-PUTDST
-LZSA_DST_LO = *+1
-LZSA_DST_HI = *+2
-   STA $AAAA
-   LDA PUTDST+1
-   BNE PUTDST_DONE
-   DEC PUTDST+2
-PUTDST_DONE
-   DEC PUTDST+1
-   RTS
-
-GETLARGESRC
-   JSR GETSRC                           ; grab low 8 bits
-   TAX                                  ; move to X
-                                        ; fall through grab high 8 bits
-
-GETSRC
-LZSA_SRC_LO = *+1
-LZSA_SRC_HI = *+2
-   LDA $AAAA
-   PHA
-   LDA GETSRC+1
-   BNE GETSRC_DONE
-   DEC GETSRC+2
-GETSRC_DONE
-   DEC GETSRC+1
-   PLA
-   RTS
-
-} else {
-
-   ; Forward decompression -- get and put bytes forward
-
-GETPUT
-   JSR GETSRC
-PUTDST
-LZSA_DST_LO = *+1
-LZSA_DST_HI = *+2
-   STA $AAAA
-   INC PUTDST+1
-   BNE PUTDST_DONE
-   INC PUTDST+2
-PUTDST_DONE
-   RTS
-
-GETLARGESRC
-   JSR GETSRC                           ; grab low 8 bits
-   TAX                                  ; move to X
-                                        ; fall through grab high 8 bits
-
-GETSRC
-LZSA_SRC_LO = *+1
-LZSA_SRC_HI = *+2
-   LDA $AAAA
-   INC GETSRC+1
-   BNE GETSRC_DONE
-   INC GETSRC+2
-GETSRC_DONE
-   RTS
-
-}
-
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA2 block.
+; Create one with lzsa -r -f2 <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
+; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+NIBCOUNT = $FC                          ; zero-page nibble flag: 1 if a low nibble is still buffered
+
+DECOMPRESS_LZSA2                        ; entry point: decompress one raw LZSA2 block
+   LDY #$00                             ; Y holds the high byte of the literal/match counts
+   STY NIBCOUNT                         ; no nibble buffered yet
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: XYZ|LL|MMM
+   PHA                                  ; preserve token on stack
+
+   AND #$18                             ; isolate literals count (LL)
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   LSR                                  ; shift literals count into place
+   LSR
+   LSR
+   CMP #$03                             ; LITERALS_RUN_LEN_V2?
+   BCC PREPARE_COPY_LITERALS            ; if less, count is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra literals length nibble
+                                        ; add nibble to len from token
+   ADC #$02                             ; (LITERALS_RUN_LEN_V2) minus carry
+   CMP #$12                             ; LITERALS_RUN_LEN_V2 + 15 ?
+   BCC PREPARE_COPY_LITERALS            ; if less, literals count is complete
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; overflow?
+
+PREPARE_COPY_LITERALS
+   TAX                                  ; low byte of literals count in X
+   BCC PREPARE_COPY_LITERALS_HIGH       ; if not, literals count is complete
+
+                                        ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+
+PREPARE_COPY_LITERALS_HIGH
+   TXA                                  ; test low byte of the count
+   BEQ COPY_LITERALS                    ; low byte zero: exactly Y * 256 bytes to copy
+   INY                                  ; otherwise one extra, partial pass of the outer loop
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX
+   BNE COPY_LITERALS
+   DEY
+   BNE COPY_LITERALS
+   
+NO_LITERALS
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   ASL
+   BCS REPMATCH_OR_LARGE_OFFSET         ; 1YZ: rep-match or 13/16 bit offset
+
+   ASL                                  ; 0YZ: 5 or 9 bit offset
+   BCS OFFSET_9_BIT         
+    
+                                        ; 00Z: 5 bit offset
+
+   LDX #$FF                             ; set offset bits 15-8 to 1
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 0, read nibble for bits 4-1
+   ORA #$E0                             ; set bits 7-5 to 1
+   BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
+   
+OFFSET_9_BIT                            ; 01Z: 9 bit offset
+   ;;ASL                                  ; shift Z (offset bit 8) in place
+   ROL                                  ; Z (bit 7) -> carry
+   ROL                                  ; carry (Z) -> bit 0
+   AND #$01
+   EOR #$FF                             ; set offset bits 15-9 to 1
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_LARGE_OFFSET
+   ASL                                  ; 13 bit offset?
+   BCS REPMATCH_OR_16_BIT               ; handle rep-match or 16-bit offset if not
+
+                                        ; 10Z: 13 bit offset
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 8, read nibble for bits 12-9
+   ADC #$DE                             ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
+   ;;ASL                                  ; XYZ=111?
+   BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
+   
+                                        ; 110: handle 16 bit offset
+   JSR GETSRC                           ; grab high 8 bits
+GOT_OFFSET_HI
+   TAX
+   JSR GETSRC                           ; grab low 8 bits
+GOT_OFFSET_LO
+   STA OFFSLO                           ; store low byte of match offset
+   STX OFFSHI                           ; store high byte of match offset
+
+REP_MATCH
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   SEC                                  ; subtract match offset from dest
+   LDA PUTDST+1                         ; low 8 bits
+OFFSLO = *+1
+   SBC #$AA
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   LDA PUTDST+2
+OFFSHI = *+1
+   SBC #$AA                             ; high 8 bits
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   SEC                                  ; leave carry set for the ADC #$01 below
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   CLC                                  ; add dest + match offset
+   LDA PUTDST+1                         ; low 8 bits
+OFFSLO = *+1
+   ADC #$AA
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+OFFSHI = *+1
+   LDA #$AA                             ; high 8 bits
+   ADC PUTDST+2
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+}
+   
+   PLA                                  ; retrieve token from stack again
+   AND #$07                             ; isolate match len (MMM)
+   ADC #$01                             ; add MIN_MATCH_SIZE_V2 and carry
+   CMP #$09                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+   BCC PREPARE_COPY_MATCH               ; if less, length is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra match length nibble
+                                        ; add nibble to len from token
+   ADC #$08                             ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
+   CMP #$18                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+   BCC PREPARE_COPY_MATCH               ; if less, match length is complete
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$E8                             ; overflow?
+
+PREPARE_COPY_MATCH
+   TAX                                  ; low byte of match length in X
+   BCC PREPARE_COPY_MATCH_Y             ; if not, the match length is complete
+   BEQ DECOMPRESSION_DONE               ; if EOD code, bail
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; test low byte of the length
+   BEQ COPY_MATCH_LOOP                  ; low byte zero: exactly Y * 256 bytes to copy
+   INY                                  ; otherwise one extra, partial pass of the outer loop
+
+COPY_MATCH_LOOP                         ; operand of the LDA below is the patched back-reference address
+   LDA $AAAA                            ; get one byte of backreference
+   JSR PUTDST                           ; copy to destination
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   LDA COPY_MATCH_LOOP+1
+   BNE GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+2
+GETMATCH_DONE
+   DEC COPY_MATCH_LOOP+1
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1
+   BNE GETMATCH_DONE
+   INC COPY_MATCH_LOOP+2
+GETMATCH_DONE
+
+}
+
+   DEX
+   BNE COPY_MATCH_LOOP
+   DEY
+   BNE COPY_MATCH_LOOP
+   JMP DECODE_TOKEN
+
+GETCOMBINEDBITS                         ; in: A bit 7 = token Z bit; out: A = (nibble << 1) | !Z
+   EOR #$80                             ; invert the Z bit held in bit 7
+   ASL                                  ; move the inverted Z bit into the carry
+   PHP                                  ; save the carry across the GETNIBBLE call
+
+   JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
+   PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
+COMBINEDBITZ
+   ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
+DECOMPRESSION_DONE                      ; shared RTS: also the exit taken on the end-of-data code
+   RTS
+
+GETNIBBLE                               ; out: A = next 4-bit nibble, carry set
+NIBBLES = *+1                           ; the immediate operand below buffers the last nibble byte
+   LDA #$AA                             ; load the buffered byte ($AA is a placeholder, patched below)
+   LSR NIBCOUNT                         ; pending flag -> carry, and clear the flag
+   BCS HAS_NIBBLES                      ; low nibble still pending: use the buffered byte
+
+   INC NIBCOUNT                         ; mark the low nibble as pending for the next call
+   JSR GETSRC                           ; get 2 nibbles
+   STA NIBBLES                          ; buffer the byte for the next call
+   LSR                                  ; move the high nibble down to bits 3-0
+   LSR 
+   LSR 
+   LSR 
+   SEC                                  ; return with carry set, as on the buffered path
+
+HAS_NIBBLES
+   AND #$0F                             ; isolate low 4 bits of nibble
+   RTS
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; A = next source byte
+PUTDST                                  ; store A at destination, then step pointer back
+LZSA_DST_LO = *+1                       ; address of the low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; address of the high byte of the STA operand below
+   STA $AAAA                            ; $AAAA is a placeholder: the operand is the destination pointer
+   LDA PUTDST+1                         ; test pointer low byte (A is overwritten here)
+   BNE PUTDST_DONE                      ; non-zero: no borrow from the high byte
+   DEC PUTDST+2                         ; low byte is zero: borrow from the high byte
+PUTDST_DONE
+   DEC PUTDST+1                         ; step destination pointer back by one
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; A = next source byte, then step pointer back
+LZSA_SRC_LO = *+1                       ; address of the low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; address of the high byte of the LDA operand below
+   LDA $AAAA                            ; $AAAA is a placeholder: the operand is the source pointer
+   PHA                                  ; save the fetched byte, A is needed for the pointer test
+   LDA GETSRC+1                         ; test pointer low byte
+   BNE GETSRC_DONE                      ; non-zero: no borrow from the high byte
+   DEC GETSRC+2                         ; low byte is zero: borrow from the high byte
+GETSRC_DONE
+   DEC GETSRC+1                         ; step source pointer back by one
+   PLA                                  ; restore the fetched byte; carry is left untouched
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; A = next source byte
+PUTDST                                  ; store A at destination, then advance pointer
+LZSA_DST_LO = *+1                       ; address of the low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; address of the high byte of the STA operand below
+   STA $AAAA                            ; $AAAA is a placeholder: the operand is the destination pointer
+   INC PUTDST+1                         ; advance pointer low byte
+   BNE PUTDST_DONE                      ; no wrap to zero: high byte unchanged
+   INC PUTDST+2                         ; low byte wrapped: carry into the high byte
+PUTDST_DONE
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; A = next source byte, then advance pointer
+LZSA_SRC_LO = *+1                       ; address of the low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; address of the high byte of the LDA operand below
+   LDA $AAAA                            ; $AAAA is a placeholder: the operand is the source pointer
+   INC GETSRC+1                         ; advance pointer low byte (A and carry untouched)
+   BNE GETSRC_DONE                      ; no wrap to zero: high byte unchanged
+   INC GETSRC+2                         ; low byte wrapped: carry into the high byte
+GETSRC_DONE
+   RTS
+
+}
+
--- a/asm/8088/LZSA1JMP.ASM
+++ b/asm/8088/LZSA1JMP.ASM
@ -1,32 +1,125 @@
-;  lzsa1fta.asm time-efficient decompressor implementation for 8086 CPUs.
-;  Turbo Assembler IDEAL mode dialect; can also be assembled with NASM.
+; lzsa1jmp.asm time-efficient decompressor implementation for 808x CPUs.
+; Turbo Assembler IDEAL mode dialect.
+; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.)
 ;
-;  Usual DOS assembler SMALL model assumptions apply.  This code:
-;  - Assumes it was invoked via NEAR call (change RET to RETF for FAR calls)
-;  - Is interrupt-safe
-;  - Is not re-entrant (do not decompress while already running decompression)
-;  - Trashes all data and segment registers
+; This code assembles to about 3K of lookup tables and unrolled code,
+; but the tradeoff for that size is the absolute fastest decompressor
+; of LZSA1 block data for 808x CPUs.
+; If you need moderately fast code with less size, see LZSA1FTA.ASM.
+; If you need the smallest decompression code, see decompress_small_v1.S.
 ;
-;  Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+; Usual DOS assembler SMALL model assumptions apply.  This code:
+; - Assumes it was invoked via NEAR call (change RET to RETF for FAR calls)
+; - Is interrupt-safe
+; - Is not re-entrant (do not decompress while already running decompression)
+; - Trashes all data and segment registers
 ;
-;  This software is provided 'as-is', without any express or implied
-;  warranty.  In no event will the authors be held liable for any damages
-;  arising from the use of this software.
+; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
 ;
-;  Permission is granted to anyone to use this software for any purpose,
-;  including commercial applications, and to alter it and redistribute it
-;  freely, subject to the following restrictions:
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
 ;
-;  1. The origin of this software must not be misrepresented; you must not
-;     claim that you wrote the original software. If you use this software
-;     in a product, an acknowledgment in the product documentation would be
-;     appreciated but is not required.
-;  2. Altered source versions must be plainly marked as such, and must not be
-;     misrepresented as being the original software.
-;  3. This notice may not be removed or altered from any source distribution.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; ===========================================================================
+;
+; The key area to concentrate on when optimizing LZSA1 decompression speed is
+; reducing time spent handling the shortest matches. This is for two reasons:
+;   1. shorter matches are more common
+;   2. short matches are least efficient in terms of decomp speed per byte
+; You can confirm #1 using the --stats mode of the compressor.
+;
+; Branches are costly on 8086.  To ensure we branch as little as possible, a
+; jumptable will be used to branch directly to as many direct decode paths as
+; possible.  This will burn up 512 bytes of RAM for a jumptable, and a few
+; hundred bytes of duplicated program code (rather than JMP/CALL common code
+; blocks, we inline them to avoid the branch overhead).
+;
+; ===========================================================================
+;
+; === LZSA1 block reference:
+;
+; Blocks encoded as LZSA1 are composed from consecutive commands.
+; Each command follows this format:
+;
+; token: <O|LLL|MMMM>
+; optional extra literal length
+; literal values
+; match offset low
+; optional match offset high
+; optional extra encoded match length
+;
+;
+; === LZSA1 Token Reference:
+;
+; 7 6 5 4 3 2 1 0
+; O L L L M M M M
+;
+; L: 3-bit literals length (0-6, or 7 if extended). If the number of literals for
+; this command is 0 to 6, the length is encoded in the token and no extra bytes
+; are required. Otherwise, a value of 7 is encoded and extra bytes follow as
+; 'optional extra literal length'
+;
+; M: 4-bit encoded match length (0-14, or 15 if extended). Likewise, if the
+; encoded match length for this command is 0 to 14, it is directly stored,
+; otherwise 15 is stored and extra bytes follow as 'optional extra encoded match
+; length'. Except for the last command in a block, a command always contains a
+; match, so the encoded match length is the actual match length, offset by the
+; minimum which is 3 bytes. For instance, an actual match length of 10 bytes to
+; be copied, is encoded as 7.
+;
+; O: set for a 2-bytes match offset, clear for a 1-byte match offset
+;
+;
+; === Decoding extended literal length:
+;
+; If the literals length is 7 or more, then an extra byte follows here, with
+; three possible types of value:
+;
+;   0-248: the value is added to the 7 stored in the token.
+;   250: a second byte follows. The final literals value is 256 + the second byte.
+;   249: a little-endian 16-bit value follows, forming the final literals value.
+;
+;
+; === Decoding match offsets:
+;
+; match offset low: The low 8 bits of the match offset follows.
+;
+; optional match offset high: If the 'O' bit (bit 7) is set in the token, the
+; high 8 bits of the match offset follow, otherwise they are understood to be all
+; set to 1. For instance, a short offset of 0x70 is interpreted as 0xff70
+;
+;
+; === Decoding extra encoded match length:
+;
+; optional extra encoded match length: If the encoded match length is 15 or more,
+; the 'M' bits in the token form the value 15, and an extra byte follows here,
+; with three possible types of value.
+;
+;  0-237: the value is added to the 15 stored in the token. The final value is 3 + 15 + this byte.
+;  239:   a second byte follows. The final match length is 256 + the second byte.
+;  238:   a second and third byte follow, forming a little-endian 16-bit value.
+;         The final encoded match length is that 16-bit value.
+;
+; ===========================================================================

-        IDEAL
-        P8086
+        IDEAL   ; Use Turbo Assembler IDEAL syntax checking
+        P8086   ; Restrict code generation to the 808x and later
+        JUMPS   ; Perform fixups for out-of-bound conditional jumps
+                ; This is required for the (L=07 & M=0Fh) decode paths as they
+                ; have the most code, but these are uncommon paths so the
+                ; tiny speed loss in just these paths is not a concern.

 SEGMENT CODE para public

@ -34,203 +127,385 @@ ASSUME  cs:CODE, ds:CODE

 PUBLIC  lzsa1_decompress_speed_jumptable

-;  ---------------------------------------------------------------------------
-;  Decompress raw LZSA1 block
-;  inputs:
-;  * ds:si: raw LZSA1 block
-;  * es:di: output buffer
-;  output:
-;  * ax:    decompressed size
-;  ---------------------------------------------------------------------------
+; EQU helper statements (so we can construct a jump table without going crazy)

-;Jump table for handling LLL bits in initial LZSA1 tokens.
-;Previous code would SHR val,4 to get a count from 0 to 7, then rep movsb.
-;We can overload the shift operation into a jump table that jumps directly
-;to optimized copying routine for 0-7 bytes.  Must declare in code segment.
-;Note: If this looks strange for declaring a jump table, that's because it
-;is a workaround for the Turbo Pascal harness that tests it.  Turbo Pascal
-;treats OFFSET (label) as a relocatble item and throws an error, so we fool
-;it by building the table with absolute EQU/literals instead.
-L0b EQU OFFSET check_offset_size
-L1b EQU OFFSET copy1b
-L2b EQU OFFSET copy2b
-L3b EQU OFFSET copy3b
-L4b EQU OFFSET copy4b
-L5b EQU OFFSET copy5b
-L6b EQU OFFSET copy6b
-L7b EQU OFFSET need_length_byte
-copytable DW L0b,L0b,L0b,L0b,L0b,L0b,L0b,L0b
-          DW L1b,L1b,L1b,L1b,L1b,L1b,L1b,L1b
-          DW L2b,L2b,L2b,L2b,L2b,L2b,L2b,L2b
-          DW L3b,L3b,L3b,L3b,L3b,L3b,L3b,L3b
-          DW L4b,L4b,L4b,L4b,L4b,L4b,L4b,L4b
-          DW L5b,L5b,L5b,L5b,L5b,L5b,L5b,L5b
-          DW L6b,L6b,L6b,L6b,L6b,L6b,L6b,L6b
-          DW L7b,L7b,L7b,L7b,L7b,L7b,L7b,L7b
+minmatch EQU 3
+litrunlen EQU 7
+
+leml1 EQU OFFSET lit_ext_mat_len_1b
+leme1 EQU OFFSET lit_ext_mat_ext_1b
+leml2 EQU OFFSET lit_ext_mat_len_2b
+leme2 EQU OFFSET lit_ext_mat_ext_2b
+
+;short-circuit special cases for 0 through 6 literal copies:
+l6ml1 EQU OFFSET lit_len_mat_len_1b
+l6me1 EQU OFFSET lit_len_mat_ext_1b
+l6ml2 EQU OFFSET lit_len_mat_len_2b
+l6me2 EQU OFFSET lit_len_mat_ext_2b
+l5ml1 EQU OFFSET lit_len_mat_len_1b + 1
+l5me1 EQU OFFSET lit_len_mat_ext_1b + 1
+l5ml2 EQU OFFSET lit_len_mat_len_2b + 1
+l5me2 EQU OFFSET lit_len_mat_ext_2b + 1
+l4ml1 EQU OFFSET lit_len_mat_len_1b + 2
+l4me1 EQU OFFSET lit_len_mat_ext_1b + 2
+l4ml2 EQU OFFSET lit_len_mat_len_2b + 2
+l4me2 EQU OFFSET lit_len_mat_ext_2b + 2
+l3ml1 EQU OFFSET lit_len_mat_len_1b + 3
+l3me1 EQU OFFSET lit_len_mat_ext_1b + 3
+l3ml2 EQU OFFSET lit_len_mat_len_2b + 3
+l3me2 EQU OFFSET lit_len_mat_ext_2b + 3
+l2ml1 EQU OFFSET lit_len_mat_len_1b + 4
+l2me1 EQU OFFSET lit_len_mat_ext_1b + 4
+l2ml2 EQU OFFSET lit_len_mat_len_2b + 4
+l2me2 EQU OFFSET lit_len_mat_ext_2b + 4
+l1ml1 EQU OFFSET lit_len_mat_len_1b + 5
+l1me1 EQU OFFSET lit_len_mat_ext_1b + 5
+l1ml2 EQU OFFSET lit_len_mat_len_2b + 5
+l1me2 EQU OFFSET lit_len_mat_ext_2b + 5
+l0ml1 EQU OFFSET lit_len_mat_len_1b + 6 ; MMMM handling comes after LLL code
+l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code
+l0ml2 EQU OFFSET lit_len_mat_len_2b + 6 ; MMMM handling comes after LLL code
+l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code
+
+; === Hand-written (!) jumptable actually begins here.
+; Placing the table before the program code costs an extra JMP (3 wasted
+; bytes), but it makes the code easier to follow in this location.
+; Relocate the jump table after the ENDP directive to save those 3 bytes.
+;
+; 7 6 5 4 3 2 1 0
+; O L L L M M M M
+;
+;         0     1     2     3     4     5     6     7     8     9     a     b     c     d     e     f
+jtbl DW l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0me1 ;0
+     DW l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1me1 ;1
+     DW l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2me1 ;2
+     DW l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3me1 ;3
+     DW l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4me1 ;4
+     DW l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5me1 ;5
+     DW l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6me1 ;6
+     DW leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leme1 ;7
+     DW l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0me2 ;8
+     DW l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1me2 ;9
+     DW l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2me2 ;a
+     DW l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3me2 ;b
+     DW l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4me2 ;c
+     DW l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5me2 ;d
+     DW l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6me2 ;e
+     DW leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leme2 ;f

 PROC    lzsa1_decompress_speed_jumptable  NEAR
+; ---------------------------------------------------------------------------
+; Decompress raw LZSA1 block
+; inputs:
+; * ds:si: raw LZSA1 block
+; * es:di: output buffer
+; output:
+; * ax:    decompressed size
+; ---------------------------------------------------------------------------
+
+MACRO get_byte_match_offset
+        mov     ah,0ffh         ;O=0: 1-byte offset, so the high byte is implied 0FFh
+        lodsb                   ;load low byte from ds:si; ax=negative match offset
+        xchg    bp,ax           ;bp=match offset  ax=00 + original token (decode_token saved it in bp)
+ENDM
+
+MACRO get_word_match_offset
+        lodsw                   ;O=1: load the full 16-bit match offset from ds:si into ax
+        xchg    bp,ax           ;bp=match offset  ax=00 + original token (decode_token saved it in bp)
+ENDM
+
+MACRO do_match_copy_long
+LOCAL do_run, do_run_w
+; Copies a long match as optimally as possible, then jumps to decode_token.
+; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
+; trashes: ax, bx (plus dx on the do_run/do_run_w paths)
+; must leave cx=0 at exit
+        mov     bx,ds           ;save ds in bx
+        mov     ax,es
+        mov     ds,ax           ;ds=es
+        xchg    ax,si           ;save si (compdata pointer) in ax
+        lea     si,[bp+di]      ;si = output buffer + negative match offset
+        cmp     bp,-2           ;do we have a byte/word run to optimize?
+        jae     do_run          ;perform a run if so, otherwise fall through
+;You may be tempted to change "jae" to "jge" because BP is a signed number.
+;Don't!  The total window is 64k, so if you treat this as a signed comparison,
+;you will get incorrect results for offsets over 32K.
+
+;If we're here, we have a long copy and it isn't byte-overlapping (if it
+;overlapped, we'd be in do_run)  So, let's copy faster with REP MOVSW.
+;This affects 8088 only slightly, but is a bigger win on 8086 and higher.
+        shr     cx,1            ;cx=word count, cf=1 if the length was odd
+        rep     movsw
+        adc     cl,0            ;cx=0 after rep, so cl=cf: one leftover byte or none
+        rep     movsb
+        xchg    si,ax           ;restore si
+        mov     ds,bx           ;restore ds
+        jmp     decode_token
+
+do_run:
+        je      do_run_w        ;flags still from cmp bp,-2: offset=-2 is a word run
+        xchg    dx,ax           ;save si into dx, as ax is getting trashed
+        lodsb                   ;load first byte of run into al
+        mov     ah,al           ;replicate it so ax holds the byte twice
+        shr     cx,1            ;cx=word count, cf=1 if the length was odd
+        rep     stosw           ;perform word run
+        adc     cl,0            ;cl=cf: one leftover byte or none
+        rep     stosb           ;finish word run
+        mov     si,dx           ;restore si
+        mov     ds,bx           ;restore ds
+        jmp     decode_token
+
+do_run_w:
+        xchg    dx,ax           ;save si into dx, as ax is getting trashed
+        lodsw                   ;load first word of run
+        shr     cx,1            ;cx=word count, cf=1 if the length was odd
+        rep     stosw           ;perform word run
+        adc     cl,0            ;despite 2-byte offset, compressor might
+        rep     stosb           ;output odd length. better safe than sorry.
+        mov     si,dx           ;restore si
+        mov     ds,bx           ;restore ds
+        jmp     decode_token
+ENDM
+
+MACRO do_match_copy
+; Copies a shorter match with a plain REP MOVSB, then jumps to decode_token.
+; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
+; trashes: ax, bx
+; must leave cx=0 at exit
+        mov     bx,ds           ;save ds in bx
+        mov     ax,es
+        mov     ds,ax           ;ds=es
+        xchg    ax,si           ;save si (compdata pointer) in ax
+        lea     si,[bp+di]      ;si = output buffer + negative match offset
+        rep     movsb           ;byte copy from the output buffer; leaves cx=0
+        xchg    si,ax           ;restore si
+        mov     ds,bx           ;restore ds
+        jmp     decode_token
+ENDM
+
+MACRO do_literal_copy
+; Copies a literal sequence using words, plus one trailing byte if odd.
+; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
+; requirements: cx=length, ds:si=compdata, es:di=output
+; must leave cx=0 at exit
+        shr     cx,1            ;cx=word count, cf=1 if the length was odd
+        rep     movsw           ;copy the whole words
+        adc     cl,0            ;cx=0 after rep, so cl=cf (0 or 1)
+        rep     movsb           ;copy the odd trailing byte, if any; cx=0 again
+ENDM
+
+MACRO copy_small_match_len
+        and     al,0FH          ;isolate length in token (MMMM); al=token from get_????_match_offset
+        add     al,minmatch     ;ax=match length (relies on ah=0 from get_????_match_offset's xchg)
+        xchg    cx,ax           ;cx=match length
+        do_match_copy           ;copy match with cx=length, bp=offset; jumps to decode_token
+ENDM
+
+MACRO copy_large_match_len
+LOCAL val239, val238, EOD
+; Handle MMMM=Fh: decode the extended match length, then copy the match.
+; Assumptions: ah=0 from get_????_match_offset's xchg
+        lodsb                   ;grab extra match length byte
+        add     al,0Fh+minmatch ;add MATCH_RUN_LEN + MIN_MATCH_SIZE
+        jz      val238          ;if zf & cf, 238: get 16-bit match length
+        jc      val239          ;if cf,      239: get extra match length byte
+        xchg    cx,ax           ;otherwise, we have our match length
+        do_match_copy_long      ;copy match with cx=length, bp=offset
+val239:
+        lodsb                   ;ah=0; grab single extra length byte
+        inc     ah              ;ax=256+length byte
+        xchg    cx,ax           ;cx=match length
+        do_match_copy_long      ;copy match with cx=length, bp=offset
+val238:
+        lodsw                   ;grab 16-bit length
+        xchg    cx,ax           ;cx=match length
+        jcxz    EOD             ;a 16-bit length of 0 is the EOD marker: exit if so
+        do_match_copy_long      ;copy match with cx=length, bp=offset
+EOD:
+        jmp     done_decompressing ;presumably a trampoline because jcxz only has a short form
+ENDM
+

 lzsa1_start:
        push    di              ;remember decompression offset
        cld                     ;ensure string ops move forward
        xor     cx,cx

-@@decode_token:
-        xchg    cx,ax           ;clear ah (cx = 0 from match copy's rep movsb)
+decode_token:
+        xchg    cx,ax           ;clear ah (cx = 0 from match copy's REP)
        lodsb                   ;read token byte: O|LLL|MMMM
-        mov     dx,ax           ;copy our token to dl for later MMMM handling
+        mov     bp,ax           ;preserve 0+token in bp for later MMMM handling
+        mov     bx,ax           ;prep for table lookup
+        shl     bx,1            ;adjust for offset word size
+        jmp     [cs:jtbl+bx]    ;jump directly to relevant decode path

-        and     al,070H         ;isolate literals length in token (LLL)
-        jz      check_offset_size ;if LLL=0, we have no literals; goto match
+; There are eight basic decode paths for an LZSA1 token.  Each of these
+; paths perform only the necessary actions to decode the token and then
+; fetch the next token.  This results in a lot of code duplication, but
+; it is the only way to get down to two branches per token (jump to unique
+; decode path, then jump back to next token) for the most common cases.

-; Jump to short copy routine for LLL=1 though 6, need_length_byte for LLL=7
-        mov     bx,ax           ;prep for table lookup (must copy, don't XCHG!)
-        jmp     [cs:copytable+bx]
+; Path #1: LLL=0-6, MMMM=0-Eh, O=0 (1-byte match offset)
+; jtbl enters at this label + (6-LLL), skipping the MOVSBs that are not needed
+lit_len_mat_len_1b:
+        movsb                   ;LLL=6 enters here (l6ml1 = label+0)
+        movsb                   ;LLL=5 enters here (l5ml1 = label+1)
+        movsb                   ;LLL=4 enters here (l4ml1 = label+2)
+        movsb                   ;LLL=3 enters here (l3ml1 = label+3)
+        movsb                   ;LLL=2 enters here (l2ml1 = label+4)
+        movsb                   ;LLL=1 enters here (l1ml1 = label+5)
+        get_byte_match_offset   ;LLL=0 enters here (l0ml1 = label+6): no literals
+        copy_small_match_len    ;copies the match and jumps back to decode_token

-need_length_byte:
-        lodsb                   ;grab extra length byte
-        add     al,07H          ;add LITERALS_RUN_LEN
-        jnc     @@got_literals_exact ;if no overflow, we have full count
-        je      @@big_literals

-@@mid_literals:
-        lodsb                   ;grab single extra length byte
-        inc     ah              ;add 256
-        xchg    cx,ax           ;with longer counts, we can save some time
-        shr     cx,1            ;by doing a word copy instead of a byte copy.
-        rep     movsw           ;We don't need to account for overlap because
-        adc     cx,0            ;source for literals isn't the output buffer.
-        rep     movsb
-        jmp     check_offset_size
+; Path #2: LLL=0-6, MMMM=Fh,   O=0 (1-byte match offset)
+lit_len_mat_ext_1b:
+        movsb                   ;LLL=6 enters here (l6me1 = label+0)
+        movsb                   ;LLL=5 enters here (l5me1 = label+1)
+        movsb                   ;LLL=4 enters here (l4me1 = label+2)
+        movsb                   ;LLL=3 enters here (l3me1 = label+3)
+        movsb                   ;LLL=2 enters here (l2me1 = label+4)
+        movsb                   ;LLL=1 enters here (l1me1 = label+5)
+        get_byte_match_offset   ;LLL=0 enters here (l0me1 = label+6): no literals
+        copy_large_match_len    ;decodes extended length, copies match or exits at EOD

-@@big_literals:
-        lodsw                   ;grab 16-bit extra length
-        xchg    cx,ax           ;with longer counts, we can save some time
-        shr     cx,1            ;by doing a word copy instead of a byte copy.
-        rep     movsw
-        adc     cx,0
-        rep     movsb
-        jmp     check_offset_size

-; Used for counts 7-248. In test data, average value around 1Ah.  YMMV.
-@@got_literals_exact:
+; Path #3: LLL=7,   MMMM=0-Eh, O=0 (1-byte match offset)
+lit_ext_mat_len_1b:
+; on entry: ax=0 + token, bp=ax
+        lodsb                   ;grab extra literal length byte
+        add     al,litrunlen    ;add 7h literal run length
+        jz      @@val249_3      ;if zf & cf, 249: get 16-bit literal length
+        jc      @@val250_3      ;if cf,      250: get extra literal length byte
+        xchg    cx,ax           ;otherwise, we have our literal length
+        do_literal_copy         ;word copy; rep movsb might be better for these short runs !!! benchmark
+        get_byte_match_offset
+        copy_small_match_len
+@@val250_3:
+        lodsb                   ;ah=0; grab single extra length byte
+        inc     ah              ;ax=256+length byte
        xchg    cx,ax
-        rep     movsb           ;copy cx literals from ds:si to es:di
-        jmp     check_offset_size
-
-;Literal copy sequence for lengths 1-6:
-copy6b: movsb
-copy5b: movsb
-copy4b: movsb
-copy3b: movsb
-copy2b: movsb
-copy1b: movsb
-
-;Literals done; fall through to match offset determination
-check_offset_size:
-        test    dl,dl           ;check match offset size in token (O bit)
-        js      @@get_long_offset ;load absolute 16-bit match offset
-
-        mov     ah,0ffh         ;set up high byte
-        lodsb                   ;load low byte
-
-@@get_match_length:
-        xchg    dx,ax           ;dx: match offset  ax: original token
-        and     al,0FH          ;isolate match length in token (MMMM)
-        cmp     al,0FH          ;MATCH_RUN_LEN?
-        jne     @@got_matchlen_short  ;no, we have the full match length from the token, go copy
-
-        lodsb                   ;grab extra length byte
-        add     al,012H         ;add MIN_MATCH_SIZE + MATCH_RUN_LEN
-        jnc     @@do_long_copy  ;if no overflow, we have the entire length
-        jne     @@mid_matchlen
-
+        do_literal_copy
+        get_byte_match_offset
+        copy_small_match_len
+@@val249_3:
        lodsw                   ;grab 16-bit length
-        xchg    cx,ax           ;get ready to do a long copy
-        jcxz    @@done_decompressing ;wait, is it the EOD marker? Exit if so
-        jmp     @@copy_len_preset ;otherwise, do the copy
+        xchg    cx,ax
+        do_literal_copy
+        get_byte_match_offset
+        copy_small_match_len

-@@got_matchlen_short:
-        add     al,3            ;add MIN_MATCH_SIZE
-        xchg    cx,ax           ;copy match length into cx
-        mov     bp,ds           ;save ds
-        mov     ax,es
-        mov     ds,ax           ;ds=es
-        xchg    ax,si           ;save si
-        mov     si,di           ;ds:si now points at back reference in output data
-        add     si,dx
-        rep     movsb           ;copy match
-        xchg    si,ax           ;restore si
-        mov     ds,bp           ;restore ds
-        jmp     @@decode_token  ;go decode another token

-@@done_decompressing:
+; Path #4: LLL=7,   MMMM=Fh,   O=0 (1-byte match offset)
+lit_ext_mat_ext_1b:
+; on entry: ax=0 + token, bp=ax
+        lodsb                   ;grab extra literal length byte
+        add     al,litrunlen    ;add 7h literal run length
+        jz      @@val249_4      ;if zf & cf, 249: get 16-bit literal length
+        jc      @@val250_4      ;if cf,      250: get extra literal length byte
+        xchg    cx,ax           ;otherwise, we have our literal length
+        do_literal_copy         ;word copy; rep movsb might be better for these short runs !!! benchmark
+        get_byte_match_offset
+        copy_large_match_len
+@@val250_4:
+        lodsb                   ;ah=0; grab single extra length byte
+        inc     ah              ;ax=256+length byte
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_byte_match_offset
+        copy_large_match_len
+@@val249_4:
+        lodsw                   ;grab 16-bit length
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_byte_match_offset
+        copy_large_match_len
+
+
+; Path #5: LLL=0-6, MMMM=0-Eh, O=1 (2-byte match offset)
+; jtbl enters at this label + (6-LLL), skipping the MOVSBs that are not needed
+lit_len_mat_len_2b:
+        movsb                   ;LLL=6 enters here (l6ml2 = label+0)
+        movsb                   ;LLL=5 enters here (l5ml2 = label+1)
+        movsb                   ;LLL=4 enters here (l4ml2 = label+2)
+        movsb                   ;LLL=3 enters here (l3ml2 = label+3)
+        movsb                   ;LLL=2 enters here (l2ml2 = label+4)
+        movsb                   ;LLL=1 enters here (l1ml2 = label+5)
+        get_word_match_offset   ;LLL=0 enters here (l0ml2 = label+6): no literals
+        copy_small_match_len    ;copies the match and jumps back to decode_token
+
+
+; Path #6: LLL=0-6, MMMM=Fh,   O=1 (2-byte match offset)
+lit_len_mat_ext_2b:
+        movsb                   ;LLL=6 enters here (l6me2 = label+0)
+        movsb                   ;LLL=5 enters here (l5me2 = label+1)
+        movsb                   ;LLL=4 enters here (l4me2 = label+2)
+        movsb                   ;LLL=3 enters here (l3me2 = label+3)
+        movsb                   ;LLL=2 enters here (l2me2 = label+4)
+        movsb                   ;LLL=1 enters here (l1me2 = label+5)
+        get_word_match_offset   ;LLL=0 enters here (l0me2 = label+6): no literals
+        copy_large_match_len    ;decodes extended length, copies match or exits at EOD
+
+
+; Path #7: LLL=7,   MMMM=0-Eh, O=1 (2-byte match offset)
+lit_ext_mat_len_2b:
+; on entry: ax=0 + token, bp=ax
+        lodsb                   ;grab extra literal length byte
+        add     al,litrunlen    ;add 7h literal run length
+        jz      @@val249_7      ;if zf & cf, 249: get 16-bit literal length
+        jc      @@val250_7      ;if cf,      250: get extra literal length byte
+        xchg    cx,ax           ;otherwise, we have our literal length
+        do_literal_copy         ;word copy; rep movsb might be better for these short runs !!! benchmark
+        get_word_match_offset
+        copy_small_match_len
+@@val250_7:
+        lodsb                   ;ah=0; grab single extra length byte
+        inc     ah              ;ax=256+length byte
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_word_match_offset
+        copy_small_match_len
+@@val249_7:
+        lodsw                   ;grab 16-bit length
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_word_match_offset
+        copy_small_match_len
+
+
+; Path #8: LLL=7,   MMMM=Fh,   O=1 (2-byte match offset)
+lit_ext_mat_ext_2b:
+; on entry: ax=0 + token, bp=ax
+        lodsb                   ;grab extra literal length byte
+        add     al,litrunlen    ;add 7h literal run length
+        jz      @@val249_8      ;if zf & cf, 249: get 16-bit literal length
+        jc      @@val250_8      ;if cf,      250: get extra literal length byte
+        xchg    cx,ax           ;otherwise, we have our literal length
+        do_literal_copy         ;word copy; rep movsb might be better for these short runs !!! benchmark
+        get_word_match_offset
+        copy_large_match_len
+@@val250_8:
+        lodsb                   ;ah=0; grab single extra length byte
+        inc     ah              ;ax=256+length byte
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_word_match_offset
+        copy_large_match_len
+@@val249_8:
+        lodsw                   ;grab 16-bit length
+        xchg    cx,ax           ;cx=literal length
+        do_literal_copy
+        get_word_match_offset
+        copy_large_match_len
+
+
+done_decompressing:
+;return # of decompressed bytes in ax
        pop     ax              ;retrieve the original decompression offset
-        xchg    di,ax           ;compute decompressed size
-        sub     ax,di
+        sub     di,ax           ;adjust for original offset
+        xchg    di,ax           ;return adjusted value in ax
        ret                     ;done decompressing, exit to caller

-;These are called less often; moved here to optimize the fall-through case
-@@get_long_offset:
-        lodsw                   ;Get 2-byte match offset
-        jmp     @@get_match_length
-
-;With a confirmed longer match length, we have an opportunity to optimize for
-;the case where a single byte is repeated long enough that we can benefit
-;from rep movsw to perform the run (instead of rep movsb).
-@@mid_matchlen:
-        lodsb                   ;grab single extra length byte
-        inc     ah              ;add 256
-@@do_long_copy:
-        xchg    cx,ax           ;copy match length into cx
-@@copy_len_preset:
-        push    ds              ;save ds
-        mov     bp,es
-        mov     ds,bp           ;ds=es
-        mov     bp,si           ;save si
-        mov     si,di           ;ds:si now points at back reference in output data
-        add     si,dx
-        cmp     dx,-2           ;do we have a byte/word run to optimize?
-        jae     @@do_run        ;perform a run
-;You may be tempted to change "jae" to "jge" because DX is a signed number.
-;Don't!  The total window is 64k, so if you treat this as a signed comparison,
-;you will get incorrect results for offsets over 32K.
-
-;If we're here, we have a long copy and it isn't byte-overlapping (if it
-;overlapped, we'd be in @@do_run_1)  So, let's copy faster with REP MOVSW.
-;This won't affect 8088 that much, but it speeds up 8086 and higher.
-        shr     cx,1
-        rep     movsw
-        adc     cx,0
-        rep     movsb
-        mov     si,bp           ;restore si
-        pop     ds
-        jmp     @@decode_token  ;go decode another token
-
-@@do_run:
-        je      @@do_run_2      ;fall through to byte (common) if not word run
-
-@@do_run_1:
-        lodsb                   ;load first byte of run into al
-        mov     ah,al
-        shr     cx,1
-        rep     stosw           ;perform word run
-        adc     cx,0
-        rep     stosb           ;finish word run
-        mov     si,bp           ;restore si
-        pop     ds
-        jmp     @@decode_token  ;go decode another token
-
-@@do_run_2:
-        lodsw                   ;load first word of run
-        shr     cx,1
-        rep     stosw           ;perform word run
-        adc     cx,0            ;despite 2-byte offset, compressor might
-        rep     stosb           ;output odd length. better safe than sorry.
-        mov     si,bp           ;restore si
-        pop     ds
-        jmp     @@decode_token  ;go decode another token
-
 ENDP    lzsa1_decompress_speed_jumptable

 ENDS    CODE
@ -238,37 +513,11 @@ ENDS    CODE
 END

 ;Speed optimization history (decompression times in microseconds @ 4.77 MHz):
-; original E. Marty code    shuttle 123208 alice 65660 robotron 407338 ***
-; table for shr al,4        shuttle 120964 alice 63230 robotron 394733 +++
-; push/pop to mov/mov       shuttle 118176 alice 61835 robotron 386762 +++
-; movsw for literalcpys     shuttle 124102 alice 64908 robotron 400220 --- rb
-; stosw for byte runs       shuttle 118897 alice 65040 robotron 403518 --- rb
-; better stosw for runs     shuttle 117712 alice 65040 robotron 403343 +--
-; disable RLE by default    shuttle 116924 alice 60783 robotron 381226 +++
-; optimize got_matchlen     shuttle 115294 alice 59588 robotron 374330 +++
-; fall through to getML     shuttle 113258 alice 59572 robotron 372004 +++
-; fall through to midLI     shuttle 113258 alice 59572 robotron 375060 ..- rb
-; fall through midMaLen     shuttle 113247 alice 59572 robotron 372004 +.+
-; movsw for litlen > 255    shuttle 113247 alice 59572 robotron 371612 ..+
-; rep stosw for long runs   shuttle 113247 alice 59572 robotron 371612 ...
-; rep movsw for long cpys   shuttle 113247 alice 59572 robotron 371035 ..+
-; xchg/dec ah -> mov ah,val shuttle 112575 alice 59272 robotron 369198 +++
-; force >12h len.to longcpy shuttle 101998 alice 59266 robotron 364459 +.+
-; more efficient run branch shuttle 102239 alice 59297 robotron 364716 --- rb
-; even more eff. run branch shuttle 101998 alice 59266 robotron 364459 ***
-; BUGFIX - bad sign compare shuttle 101955 alice 59225 robotron 364117 +++
-; reverse 16-bit len compar shuttle 102000 alice 59263 robotron 364460 --- rb
-; jcxz for EOD detection    no change to speed, but is 1 byte shorter  +++
-; force movsw for literals  shuttle 107183 alice 62555 robotron 379524 --- rb
-; defer shr4 until necessry shuttle 102069 alice 60236 robotron 364096 ---
-; skip literals if LLL=0    shuttle  98655 alice 57849 robotron 363358 ---
-; fall through to mid_liter shuttle  98595 alice 57789 robotron 361998 +++
-; == jumptable experiments begin ==
-; jumptable for small copys shuttle 101594 alice 61078 robotron 386018 ---
-; start:xchg instead of mov shuttle 100948 alice 60467 robotron 381112 +++
-; use table for LLL=0 check shuttle 106972 alice 63333 robotron 388304 --- rb
-; jmptbl to fallthrough mov shuttle 102532 alice 60760 robotron 383070 ---
-; cpy fallthrough check_ofs shuttle  98939 alice 58917 robotron 371019 +**
-; single jumptable jump     shuttle  97528 alice 57264 robotron 362194 ++*
-; conditional check for L=7 shuttle  98610 alice 58521 robotron 368153 --- rb
 ; defer add MIN_MATCH_SIZE  shuttle  97207 alice 57200 robotron 362884 ++*
+; jumptable rewrite, no RLE shuttle  97744 alice 46905 robotron 309032 -++
+; adc cx,0 -> adc cl,0      shuttle  97744 alice 46893 robotron 309032 .+.!
+; jumptable rewrite w/RLE   shuttle  88776 alice 50433 robotron 319222 +--
+; short match copies movsb  shuttle  97298 alice 49769 robotron 326282 ---rb
+; long match copy #1 16-bit shuttle  92490 alice 46905 robotron 308722 +*+
+; long match copy #2 extraB shuttle  92464 alice 46905 robotron 308371 +.+
+; long match copy #3 0f->ed shuttle  86765 alice 46864 robotron 303895 +++!
--- a/src/expand_block_v2.c
+++ b/src/expand_block_v2.c
@ -146,7 +146,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
         }
      }

-      if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
+      if (pInBlock < pInBlockEnd) { /* The last token in the block does not include match information */
         unsigned char nOffsetMode = token & 0xc0;
         unsigned int nValue;

@ -185,6 +185,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
            if ((token & 0x20) == 0) {
               /* 16 bit offset */
               nMatchOffset = (((unsigned int)(*pInBlock++)) << 8);
+               if (pInBlock >= pInBlockEnd) return -1;
               nMatchOffset |= (unsigned int)(*pInBlock++);
               nMatchOffset ^= 0xffff;
               nMatchOffset++;
--- a/src/lzsa.c
+++ b/src/lzsa.c
@ -48,7 +48,7 @@
 #define OPT_RAW_BACKWARD   8
 #define OPT_STATS          16

-#define TOOL_VERSION "1.1.2"
+#define TOOL_VERSION "1.2.0"

 /*---------------------------------------------------------------------------*/

@ -512,7 +512,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
   size_t nDataSizeStep = 128;
   float fProbabilitySizeStep = 0.0005f;

-   for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE)); nGeneratedDataSize += nDataSizeStep) {
+   for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((size_t)((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE))); nGeneratedDataSize += nDataSizeStep) {
      float fMatchProbability;

      fprintf(stdout, "size %zd", nGeneratedDataSize);
@ -530,7 +530,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
            /* Try to compress it, expected to succeed */
            size_t nActualCompressedSize = lzsa_compress_inmem(pGeneratedData, pCompressedData, nGeneratedDataSize, lzsa_get_max_compressed_size_inmem(nGeneratedDataSize),
               nFlags, nMinMatchSize, nFormatVersion);
-            if (nActualCompressedSize == -1 || nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
+            if (nActualCompressedSize == -1 || (int)nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
               free(pTmpDecompressedData);
               pTmpDecompressedData = NULL;
               free(pTmpCompressedData);
--- a/src/matchfinder.c
+++ b/src/matchfinder.c
@ -91,7 +91,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
   int nMinMatchSize = pCompressor->min_match_size;

   if (pCompressor->format_version >= 2) {
-      for (i = 1; i < nInWindowSize - 1; i++) {
+      for (i = 1; i < nInWindowSize; i++) {
         int nIndex = (int)(intervals[i] & POS_MASK);
         int nLen = PLCP[nIndex];
         if (nLen < nMinMatchSize)
@ -105,7 +105,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
      }
   }
   else {
-      for (i = 1; i < nInWindowSize - 1; i++) {
+      for (i = 1; i < nInWindowSize; i++) {
         int nIndex = (int)(intervals[i] & POS_MASK);
         int nLen = PLCP[nIndex];
         if (nLen < nMinMatchSize)
@ -116,9 +116,6 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
      }
   }

-   if (i < nInWindowSize)
-      intervals[i] &= POS_MASK;
-
   /**
    * Build intervals for finding matches
    *
@ -195,16 +192,18 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
 * @param nOffset offset to find matches at, in the input window
 * @param pMatches pointer to returned matches
 * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
 *
 * @return number of matches
 */
-int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches) {
+int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches, const int nInWindowSize) {
   unsigned int *intervals = pCompressor->intervals;
   unsigned int *pos_data = pCompressor->pos_data;
   unsigned int ref;
   unsigned int super_ref;
   unsigned int match_pos;
   lzsa_match *matchptr;
+   int nPrevOffset = 0;

   /**
    * Find matches using intervals
@ -238,7 +237,40 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
   /* Ascend indirectly via pos_data[] links.  */
   match_pos = super_ref & EXCL_VISITED_MASK;
   matchptr = pMatches;
+
+   if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
+      if ((matchptr - pMatches) < nMaxMatches) {
+         int nMatchOffset = (int)(nOffset - match_pos);
+
+         if (nMatchOffset <= MAX_OFFSET) {
+            matchptr->length = (unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
+            matchptr->offset = (unsigned short)nMatchOffset;
+            matchptr++;
+
+            nPrevOffset = nMatchOffset;
+         }
+      }
+   }
+
   for (;;) {
+      if ((super_ref = pos_data[match_pos]) > ref) {
+         match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
+
+         if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
+            if ((matchptr - pMatches) < nMaxMatches) {
+               int nMatchOffset = (int)(nOffset - match_pos);
+
+               if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
+                  matchptr->length = ((unsigned short)(ref >> (LCP_SHIFT + TAG_BITS))) | 0x8000;
+                  matchptr->offset = (unsigned short)nMatchOffset;
+                  matchptr++;
+
+                  nPrevOffset = nMatchOffset;
+               }
+            }
+         }
+      }
+
      while ((super_ref = pos_data[match_pos]) > ref)
         match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
      intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
@ -247,7 +279,7 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
      if ((matchptr - pMatches) < nMaxMatches) {
         int nMatchOffset = (int)(nOffset - match_pos);

-         if (nMatchOffset <= MAX_OFFSET) {
+         if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
            if (pCompressor->format_version >= 2) {
               matchptr->length = (unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
            }
@ -263,6 +295,23 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
         break;
      ref = super_ref;
      match_pos = intervals[ref & POS_MASK] & EXCL_VISITED_MASK;
+
+      if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
+         if ((matchptr - pMatches) < nMaxMatches) {
+            int nMatchOffset = (int)(nOffset - match_pos);
+
+            if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
+               matchptr->length = ((unsigned short)(ref >> (LCP_SHIFT + TAG_BITS))) | 0x8000;
+               matchptr->offset = (unsigned short)nMatchOffset;
+
+               if ((matchptr->length & 0x7fff) > 2) {
+                  matchptr++;
+
+                  nPrevOffset = nMatchOffset;
+               }
+            }
+         }
+      }
   }

   return (int)(matchptr - pMatches);
@ -282,7 +331,7 @@ void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, con
   /* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
    * we don't store the matches. */
   for (i = nStartOffset; i < nEndOffset; i++) {
-      lzsa_find_matches_at(pCompressor, i, &match, 0);
+      lzsa_find_matches_at(pCompressor, i, &match, 0, 0);
   }
 }

@ -295,11 +344,11 @@ void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, con
 * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
 */
 void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset) {
-   lzsa_match *pMatch = pCompressor->match + (nStartOffset * nMatchesPerOffset);
+   lzsa_match *pMatch = pCompressor->match;
   int i;

   for (i = nStartOffset; i < nEndOffset; i++) {
-      int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, nMatchesPerOffset);
+      int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, nMatchesPerOffset, nEndOffset - nStartOffset);

      while (nMatches < nMatchesPerOffset) {
         pMatch[nMatches].length = 0;
--- a/src/matchfinder.h
+++ b/src/matchfinder.h
@ -59,10 +59,11 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
 * @param nOffset offset to find matches at, in the input window
 * @param pMatches pointer to returned matches
 * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
 *
 * @return number of matches
 */
-int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches);
+int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches, const int nInWindowSize);

 /**
 * Skip previously compressed bytes
--- a/src/shrink_block_v1.c
+++ b/src/shrink_block_v1.c
@ -157,24 +157,26 @@ static inline int lzsa_get_offset_cost_v1(const unsigned int nMatchOffset) {
 * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
 */
 static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce) {
-   lzsa_arrival *arrival = pCompressor->arrival;
+   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
   const int nMinMatchSize = pCompressor->min_match_size;
   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
   const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
   int i, j, n;

-   memset(arrival + (nStartOffset << MATCHES_PER_OFFSET_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset) << MATCHES_PER_OFFSET_SHIFT));
+   if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;

-   arrival[nStartOffset << MATCHES_PER_OFFSET_SHIFT].from_slot = -1;
+   memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));

-   for (i = nStartOffset; i != (nEndOffset - 1); i++) {
+   arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
+
+   for (i = nStartOffset; i != nEndOffset; i++) {
      int m;

-      for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-         int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
+      for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+         int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
         int nCodingChoiceCost = nPrevCost + 8 /* literal */;
-         int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 1;
-         int nNumLiterals = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals + 1;
+         int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
+         int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;

         if (nNumLiterals == LITERALS_RUN_LEN_V1 || nNumLiterals == 256 || nNumLiterals == 512) {
            nCodingChoiceCost += 8;
@ -183,15 +185,15 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
         if (!nFavorRatio && nNumLiterals == 1)
            nCodingChoiceCost += MODESWITCH_PENALTY;

-         for (n = 0; n < NMATCHES_PER_OFFSET /* we only need the literals + short match cost + long match cost cases */; n++) {
-            lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n];
+         for (n = 0; n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
+            lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n];

            if (pDestArrival->from_slot == 0 ||
               nCodingChoiceCost < pDestArrival->cost ||
               (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-               memmove(&arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n + 1],
-                  &arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n],
-                  sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
+               memmove(&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
+                  &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n],
+                  sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));

               pDestArrival->cost = nCodingChoiceCost;
               pDestArrival->from_pos = i;
@ -200,15 +202,15 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
               pDestArrival->match_len = 0;
               pDestArrival->num_literals = nNumLiterals;
               pDestArrival->score = nScore;
-               pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
+               pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
               break;
            }
         }
      }

-      const lzsa_match *match = pCompressor->match + (i << 3);
+      const lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V1);

-      for (m = 0; m < 8 && match[m].length; m++) {
+      for (m = 0; m < NMATCHES_PER_INDEX_V1 && match[m].length; m++) {
         int nMatchLen = match[m].length;
         int nMatchOffsetCost = lzsa_get_offset_cost_v1(match[m].offset);
         int nStartingMatchLen, k;
@ -223,33 +225,33 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
         for (k = nStartingMatchLen; k <= nMatchLen; k++) {
            int nMatchLenCost = lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);

-            for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-               int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
+            for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+               int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
               int nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
-               int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 5;
+               int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 5;
               int exists = 0;

-               if (!nFavorRatio && !arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals)
+               if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
                  nCodingChoiceCost += MODESWITCH_PENALTY;

               for (n = 0;
-                  n < NMATCHES_PER_OFFSET && arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].cost <= nCodingChoiceCost;
+                  n < NMATCHES_PER_ARRIVAL_V1 && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].cost <= nCodingChoiceCost;
                  n++) {
-                  if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
+                  if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
                     exists = 1;
                     break;
                  }
               }

-               for (n = 0; !exists && n < NMATCHES_PER_OFFSET /* we only need the literals + short match cost + long match cost cases */; n++) {
-                  lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n];
+               for (n = 0; !exists && n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
+                  lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n];

                  if (pDestArrival->from_slot == 0 ||
                     nCodingChoiceCost < pDestArrival->cost ||
                     (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-                     memmove(&arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n + 1],
-                        &arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n],
-                        sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
+                     memmove(&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
+                        &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n],
+                        sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));

                     pDestArrival->cost = nCodingChoiceCost;
                     pDestArrival->from_pos = i;
@ -267,15 +269,14 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
      }
   }

-   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_OFFSET_SHIFT) + 0];
-   pBestMatch[i].length = 0;
-   pBestMatch[i].offset = 0;
+   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];

   while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
+      if (end_arrival->from_pos >= nEndOffset) return;
      pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
      pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;

-      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_OFFSET_SHIFT) + (end_arrival->from_slot - 1)];
+      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
   }
 }

@ -284,13 +285,14 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
 * impacting the compression ratio
 *
 * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
 * @param pBestMatch optimal matches to emit
 * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
 * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
 *
 * @return non-zero if the number of tokens was reduced, 0 if it wasn't
 */
-static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
+static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
   int i;
   int nNumLiterals = 0;
   int nDidReduce = 0;
@ -298,6 +300,28 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_mat
   for (i = nStartOffset; i < nEndOffset; ) {
      lzsa_match *pMatch = pBestMatch + i;

+      if (pMatch->length == 0 &&
+         (i + 1) < (nEndOffset - LAST_LITERALS) &&
+         pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V1 &&
+         pBestMatch[i + 1].length < MAX_VARLEN &&
+         pBestMatch[i + 1].offset &&
+         i >= pBestMatch[i + 1].offset &&
+         (i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
+         !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
+         int nCurLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V1);
+         int nReducedLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V1);
+
+         if ((nReducedLenSize - nCurLenSize) <= 8) {
+            /* Merge */
+            pBestMatch[i].length = pBestMatch[i + 1].length + 1;
+            pBestMatch[i].offset = pBestMatch[i + 1].offset;
+            pBestMatch[i + 1].length = 0;
+            pBestMatch[i + 1].offset = 0;
+            nDidReduce = 1;
+            continue;
+         }
+      }
+
      if (pMatch->length >= MIN_MATCH_SIZE_V1) {
         if (pMatch->length <= 9 /* Don't waste time considering large matches, they will always win over literals */ &&
            (i + pMatch->length) < nEndOffset /* Don't consider the last token in the block, we can only reduce a match inbetween other tokens */) {
@ -326,17 +350,33 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_mat
            }
         }

-         if ((i + pMatch->length) < nEndOffset && pMatch->length >= LCP_MAX &&
-            pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + pMatch->length].offset == pMatch->offset && (pMatch->length % pMatch->offset) == 0 &&
-            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN) {
-            int nMatchLen = pMatch->length;
+         if ((i + pMatch->length) <= nEndOffset && pMatch->offset > 0 && pMatch->length >= MIN_MATCH_SIZE_V1 &&
+            pBestMatch[i + pMatch->length].offset > 0 &&
+            pBestMatch[i + pMatch->length].length >= MIN_MATCH_SIZE_V1 &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
+            (i + pMatch->length) > pMatch->offset &&
+            (i + pMatch->length) > pBestMatch[i + pMatch->length].offset &&
+            (i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
+            !memcmp(pInWindow + i - pMatch->offset + pMatch->length,
+               pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
+               pBestMatch[i + pMatch->length].length)) {

-            /* Join */
+            int nCurPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length - MIN_MATCH_SIZE_V1);
+            nCurPartialSize += 8 /* token */ + lzsa_get_literals_varlen_size_v1(0) + ((pBestMatch[i + pMatch->length].offset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);

-            pMatch->length += pBestMatch[i + nMatchLen].length;
-            pBestMatch[i + nMatchLen].offset = 0;
-            pBestMatch[i + nMatchLen].length = -1;
-            continue;
+            int nReducedPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);
+
+            if (nCurPartialSize >= nReducedPartialSize) {
+               int nMatchLen = pMatch->length;
+
+               /* Join */
+
+               pMatch->length += pBestMatch[i + nMatchLen].length;
+               pBestMatch[i + nMatchLen].offset = 0;
+               pBestMatch[i + nMatchLen].length = -1;
+               continue;
+            }
         }

         i += pMatch->length;
@ -620,34 +660,36 @@ int lzsa_optimize_and_write_block_v1(lzsa_compressor *pCompressor, const unsigne

   /* Compress optimally without breaking ties in favor of less tokens */

-   lzsa_optimize_forward_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */);
+   memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+   lzsa_optimize_forward_v1(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */);

   int nDidReduce;
   int nPasses = 0;
   do {
-      nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
      nPasses++;
   } while (nDidReduce && nPasses < 20);

-   nBaseCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
-   lzsa_match *pBestMatch = pCompressor->best_match;
+   nBaseCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+   lzsa_match *pBestMatch = pCompressor->best_match - nPreviousBlockSize;

   if (nBaseCompressedSize > 0 && nInDataSize < 65536) {
      int nReducedCompressedSize;

      /* Compress optimally and do break ties in favor of less tokens */
-      lzsa_optimize_forward_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */);
+      memset(pCompressor->improved_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+      lzsa_optimize_forward_v1(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */);
      
      nPasses = 0;
      do {
-         nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+         nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
         nPasses++;
      } while (nDidReduce && nPasses < 20);

-      nReducedCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      nReducedCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
      if (nReducedCompressedSize > 0 && nReducedCompressedSize <= nBaseCompressedSize) {
         /* Pick the parse with the reduced number of tokens as it didn't negatively affect the size */
-         pBestMatch = pCompressor->improved_match;
+         pBestMatch = pCompressor->improved_match - nPreviousBlockSize;
      }
   }

--- a/src/shrink_block_v2.c
+++ b/src/shrink_block_v2.c
@ -174,36 +174,110 @@ static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOf
   return nOutOffset;
 }

+/**
+ * Insert forward rep candidate
+ *
+ * For every arrival slot recorded at position i whose rep_offset differs from nMatchOffset, this checks whether
+ * copying from nMatchOffset at that slot's rep_pos would yield the same rep_len bytes as copying from rep_offset.
+ * When it does, nMatchOffset is stored (or lengthened) as a match candidate in the match table entry for rep_pos,
+ * after extending the length for as long as the window data keeps matching, and the function recurses from
+ * rep_pos. Recursion stops once nDepth reaches 10.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param i input data window position whose matches are being considered
+ * @param nMatchOffset match offset to use as rep candidate
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nMatchesPerArrival number of arrivals to record per input buffer position
+ * @param nDepth current insertion depth
+ */
+static void lzsa_insert_forward_match_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, const int nMatchesPerArrival, int nDepth) {
+   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT); /* rebased so arrival[] can be indexed with absolute window positions */
+   int j;
+   /* Bound the recursion depth */
+   if (nDepth >= 10) return;
+   /* Walk the arrival slots at position i that are in use (non-zero from_slot) */
+   for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+      int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+
+      if (nMatchOffset != nRepOffset && nRepOffset && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len >= MIN_MATCH_SIZE_V2) { /* slot carries a rep-match of usable length under a different offset */
+         int nRepPos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
+         int nRepLen = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
+
+         if (nRepPos > nMatchOffset &&
+            (nRepPos - nMatchOffset + nRepLen) <= (nEndOffset - LAST_LITERALS) &&
+            !memcmp(pInWindow + nRepPos - nRepOffset, pInWindow + nRepPos - nMatchOffset, nRepLen)) { /* both offsets point at identical data for nRepLen bytes */
+            int nCurRepLen = nRepLen;
+
+            int nMaxRepLen = nEndOffset - nRepPos; /* upper bound for the extension: end of window, capped at LCP_MAX below */
+            if (nMaxRepLen > LCP_MAX)
+               nMaxRepLen = LCP_MAX;
+            while ((nCurRepLen + 8) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 8)) /* extend 8 bytes at a time, then 4, then 1 */
+               nCurRepLen += 8;
+            while ((nCurRepLen + 4) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 4))
+               nCurRepLen += 4;
+            while (nCurRepLen < nMaxRepLen && pInWindow[nRepPos + nCurRepLen] == pInWindow[nRepPos - nMatchOffset + nCurRepLen])
+               nCurRepLen++;
+
+            lzsa_match *fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2); /* match candidates stored for position nRepPos */
+            int exists = 0;
+            int r;
+
+            for (r = 0; r < NMATCHES_PER_INDEX_V2 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) { /* scan entries until the first one shorter than MIN_MATCH_SIZE_V2 */
+               if (fwd_match[r].offset == nMatchOffset) {
+                  exists = 1;
+
+                  if (fwd_match[r].length < nCurRepLen) { /* NOTE(review): length is compared unmasked; lzsa_find_matches_at can set bit 0x8000 in it -- confirm this is intended */
+                     fwd_match[r].length = nCurRepLen;
+                     lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
+                  }
+                  break;
+               }
+            }
+
+            if (!exists && r < NMATCHES_PER_INDEX_V2) { /* offset not stored yet and the scan stopped on an entry that can be overwritten */
+               fwd_match[r].offset = nMatchOffset;
+               fwd_match[r].length = nCurRepLen;
+
+               lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
+            }
+         }
+      }
+   }
+}
+
 /**
 * Attempt to pick optimal matches using a forward arrivals parser, so as to produce the smallest possible output that decompresses to the same input
 *
 * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param pBestMatch pointer to buffer for outputting optimal matches
 * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
 * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nReduce non-zero to reduce the number of tokens when the path costs are equal, zero not to
 * @param nInsertForwardReps non-zero to insert forward repmatch candidates, zero to use the previously inserted candidates
+ * @param nMatchesPerArrival number of arrivals to record per input buffer position
 */
-static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps) {
-   lzsa_arrival *arrival = pCompressor->arrival;
+static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps, const int nMatchesPerArrival) {
+   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
   const int nMinMatchSize = pCompressor->min_match_size;
   const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
+   const int nLeaveAloneMatchSize = (nMatchesPerArrival == NMATCHES_PER_ARRIVAL_V2_SMALL) ? LEAVE_ALONE_MATCH_SIZE_SMALL : LEAVE_ALONE_MATCH_SIZE;
   int i, j, n;

-   memset(arrival + (nStartOffset << MATCHES_PER_OFFSET_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset) << MATCHES_PER_OFFSET_SHIFT));
+   if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;

-   for (i = (nStartOffset << MATCHES_PER_OFFSET_SHIFT); i != (nEndOffset << MATCHES_PER_OFFSET_SHIFT); i++) {
+   memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
+
+   for (i = (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT); i != ((nEndOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT); i++) {
      arrival[i].cost = 0x40000000;
   }

-   arrival[nStartOffset << MATCHES_PER_OFFSET_SHIFT].from_slot = -1;
+   arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;

-   for (i = nStartOffset; i != (nEndOffset - 1); i++) {
+   for (i = nStartOffset; i != nEndOffset; i++) {
      int m;

-      for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-         const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
+      for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+         const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
         int nCodingChoiceCost = nPrevCost + 8 /* literal */;
-         int nNumLiterals = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals + 1;
+         int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;

         if (nNumLiterals == LITERALS_RUN_LEN_V2) {
            nCodingChoiceCost += 4;
@ -218,29 +292,36 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
         if (!nFavorRatio && nNumLiterals == 1)
            nCodingChoiceCost += MODESWITCH_PENALTY;

-         lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_OFFSET_SHIFT];
-         if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
+         lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_ARRIVAL_SHIFT];
+         if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
            int exists = 0;
            for (n = 0;
-               n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
+               n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
               n++) {
-               if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset) {
+               if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset) {
                  exists = 1;
                  break;
               }
            }

            if (!exists) {
-               int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 1;
-               for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
+               int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
+               for (n = 0; n < nMatchesPerArrival; n++) {
                  lzsa_arrival *pDestArrival = &pDestSlots[n];
                  if (nCodingChoiceCost < pDestArrival->cost ||
                     (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {

                     if (pDestArrival->from_slot) {
+                        int z;
+
+                        for (z = n; z < nMatchesPerArrival - 1; z++) {
+                           if (pDestSlots[z].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset)
+                              break;
+                        }
+
                        memmove(&pDestSlots[n + 1],
                           &pDestSlots[n],
-                           sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
+                           sizeof(lzsa_arrival) * (z - n));
                     }

                     pDestArrival->cost = nCodingChoiceCost;
@ -250,9 +331,9 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
                     pDestArrival->match_len = 0;
                     pDestArrival->num_literals = nNumLiterals;
                     pDestArrival->score = nScore;
-                     pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
-                     pDestArrival->rep_pos = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_pos;
-                     pDestArrival->rep_len = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len;
+                     pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+                     pDestArrival->rep_pos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
+                     pDestArrival->rep_len = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
                     break;
                  }
               }
@ -260,125 +341,143 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
         }
      }

-      lzsa_match *match = pCompressor->match + (i << 5);
+      lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);

-      for (m = 0; m < 32 && match[m].length; m++) {
-         int nMatchLen = match[m].length;
+      int nMinRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];
+      memset(nMinRepLen, 0, nMatchesPerArrival * sizeof(int));
+
+      for (m = 0; m < NMATCHES_PER_INDEX_V2 && match[m].length; m++) {
+         int nMatchLen = match[m].length & 0x7fff;
         int nMatchOffset = match[m].offset;
+         int nScorePenalty = ((match[m].length & 0x8000) >> 15);
         int nNoRepmatchOffsetCost = (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16));
         int nStartingMatchLen, k;
-         int nMaxRepLen[NMATCHES_PER_OFFSET];
+         int nMaxRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];

         if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
            nMatchLen = nEndOffset - LAST_LITERALS - i;

-         for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-            int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
+         for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+            int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
            int nCurMaxRepLen = 0;

-            if (nMatchOffset != nRepOffset &&
-               nRepOffset &&
-               i > nRepOffset &&
-               (i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
-               while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i - nMatchOffset + nCurMaxRepLen])
-                  nCurMaxRepLen++;
+            if (nRepOffset) {
+               if (nMatchOffset == nRepOffset)
+                  nCurMaxRepLen = nMatchLen;
+               else {
+                  if (i > nRepOffset &&
+                     (i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
+                     nCurMaxRepLen = nMinRepLen[j];
+                     while ((nCurMaxRepLen + 8) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 8))
+                        nCurMaxRepLen += 8;
+                     while ((nCurMaxRepLen + 4) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 4))
+                        nCurMaxRepLen += 4;
+                     while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i + nCurMaxRepLen])
+                        nCurMaxRepLen++;
+                     nMinRepLen[j] = nCurMaxRepLen;
+                  }
+               }
            }

            nMaxRepLen[j] = nCurMaxRepLen;
         }
-         while (j < NMATCHES_PER_OFFSET)
+         while (j < nMatchesPerArrival)
            nMaxRepLen[j++] = 0;

-         for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-            int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
+         if (nInsertForwardReps)
+            lzsa_insert_forward_match_v2(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, 0);

-            if (nMatchOffset != nRepOffset && nRepOffset && nInsertForwardReps && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len >= MIN_MATCH_SIZE_V2) {
-               int nRepPos = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_pos;
-               int nRepLen = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len;
-
-               if (nRepPos > nMatchOffset &&
-                  (nRepPos - nMatchOffset + nRepLen) <= (nEndOffset - LAST_LITERALS) &&
-                  !memcmp(pInWindow + nRepPos - nRepOffset, pInWindow + nRepPos - nMatchOffset, nRepLen)) {
-
-                  lzsa_match *fwd_match = pCompressor->match + (nRepPos << 5);
-                  int exists = 0;
-                  int r;
-
-                  for (r = 0; r < 32 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) {
-                     if (fwd_match[r].offset == nMatchOffset) {
-                        exists = 1;
-                        break;
-                     }
-                  }
-
-                  if (!exists && r < 32) {
-                     fwd_match[r].offset = nMatchOffset;
-                     fwd_match[r].length = nRepLen;
-                  }
-               }
-            }
+         int nMatchLenCost = 0;
+         if (nMatchLen >= nLeaveAloneMatchSize) {
+            nStartingMatchLen = nMatchLen;
+            nMatchLenCost = 4 + 24;
+         }
+         else {
+            nStartingMatchLen = nMinMatchSize;
+            nMatchLenCost = 0;
         }

-         if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
-            nStartingMatchLen = nMatchLen;
-         else
-            nStartingMatchLen = nMinMatchSize;
-
         for (k = nStartingMatchLen; k <= nMatchLen; k++) {
-            int nMatchLenCost = lzsa_get_match_varlen_size_v2(k - MIN_MATCH_SIZE_V2);
-            lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_OFFSET_SHIFT];
+            if (k == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2)) {
+               nMatchLenCost = 4;
+            }
+            else {
+               if (k == (MATCH_RUN_LEN_V2 + 15 + MIN_MATCH_SIZE_V2))
+                  nMatchLenCost = 4 + 8;
+               else {
+                  if (k == 256)
+                     nMatchLenCost = 4 + 24;
+               }
+            }

-            for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
-               const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
-               int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
+            lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_ARRIVAL_SHIFT];
+            int nInsertedNoRepMatchCandidate = 0;

-               int nMatchOffsetCost = (nMatchOffset == nRepOffset) ? 0 : nNoRepmatchOffsetCost;
+            for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+               const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
               int nRepCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost;
-               int nCodingChoiceCost = nRepCodingChoiceCost + nMatchOffsetCost;

-               if (!nFavorRatio && !arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals)
-                  nCodingChoiceCost += MODESWITCH_PENALTY;
+               if (nRepCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
+                  int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;

-               if (nRepCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
-                  if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
-                     int exists = 0;
+                  if (nMatchOffset != nRepOffset && !nInsertedNoRepMatchCandidate) {
+                     int nCodingChoiceCost = nRepCodingChoiceCost + nNoRepmatchOffsetCost;

-                     for (n = 0;
-                        n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
-                        n++) {
-                        if (pDestSlots[n].rep_offset == nMatchOffset) {
-                           exists = 1;
-                           break;
-                        }
-                     }
+                     if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
+                        nCodingChoiceCost += MODESWITCH_PENALTY;

-                     if (!exists) {
-                        int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + ((nMatchOffset == nRepOffset) ? 2 : 3);
+                     if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
+                        int exists = 0;
+                        int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 3 + nScorePenalty;

-                        for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
-                           lzsa_arrival *pDestArrival = &pDestSlots[n];
-
-                           if (nCodingChoiceCost < pDestArrival->cost ||
-                              (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-                              if (pDestArrival->from_slot) {
-                                 memmove(&pDestSlots[n + 1],
-                                    &pDestSlots[n],
-                                    sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
-                              }
-
-                              pDestArrival->cost = nCodingChoiceCost;
-                              pDestArrival->from_pos = i;
-                              pDestArrival->from_slot = j + 1;
-                              pDestArrival->match_offset = nMatchOffset;
-                              pDestArrival->match_len = k;
-                              pDestArrival->num_literals = 0;
-                              pDestArrival->score = nScore;
-                              pDestArrival->rep_offset = nMatchOffset;
-                              pDestArrival->rep_pos = i;
-                              pDestArrival->rep_len = k;
+                        for (n = 0;
+                           n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
+                           n++) {
+                           if (pDestSlots[n].rep_offset == nMatchOffset &&
+                              (!nInsertForwardReps || pDestSlots[n].cost != nCodingChoiceCost || pDestSlots[n].rep_pos >= i || nScore >= (pDestSlots[n].score + nDisableScore) ||
+                                 pDestSlots[nMatchesPerArrival - 1].from_slot)) {
+                              exists = 1;
                              break;
                           }
                        }
+
+                        if (!exists) {
+                           for (n = 0; n < nMatchesPerArrival - 1; n++) {
+                              lzsa_arrival *pDestArrival = &pDestSlots[n];
+
+                              if (nCodingChoiceCost < pDestArrival->cost ||
+                                 (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
+                                 if (pDestArrival->from_slot) {
+                                    int z;
+
+                                    for (z = n; z < nMatchesPerArrival - 1; z++) {
+                                       if (pDestSlots[z].rep_offset == nMatchOffset)
+                                          break;
+                                    }
+
+                                    if (z == (nMatchesPerArrival - 1) && pDestSlots[z].from_slot && pDestSlots[z].match_len < MIN_MATCH_SIZE_V2)
+                                       z--;
+
+                                    memmove(&pDestSlots[n + 1],
+                                       &pDestSlots[n],
+                                       sizeof(lzsa_arrival) * (z - n));
+                                 }
+
+                                 pDestArrival->cost = nCodingChoiceCost;
+                                 pDestArrival->from_pos = i;
+                                 pDestArrival->from_slot = j + 1;
+                                 pDestArrival->match_offset = nMatchOffset;
+                                 pDestArrival->match_len = k;
+                                 pDestArrival->num_literals = 0;
+                                 pDestArrival->score = nScore;
+                                 pDestArrival->rep_offset = nMatchOffset;
+                                 pDestArrival->rep_pos = i;
+                                 pDestArrival->rep_len = k;
+                                 nInsertedNoRepMatchCandidate = 1;
+                                 break;
+                              }
+                           }
+                        }
                     }
                  }

@ -392,7 +491,7 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
                     /* A match is possible at the rep offset; insert the extra coding choice. */

                     for (n = 0;
-                        n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nRepCodingChoiceCost;
+                        n < nMatchesPerArrival && pDestSlots[n].cost <= nRepCodingChoiceCost;
                        n++) {
                        if (pDestSlots[n].rep_offset == nRepOffset) {
                           exists = 1;
@ -401,17 +500,24 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
                     }

                     if (!exists) {
-                        int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 2;
+                        int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 2;

-                        for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
+                        for (n = 0; n < nMatchesPerArrival; n++) {
                           lzsa_arrival *pDestArrival = &pDestSlots[n];

                           if (nRepCodingChoiceCost < pDestArrival->cost ||
                              (nRepCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
                              if (pDestArrival->from_slot) {
+                                 int z;
+
+                                 for (z = n; z < nMatchesPerArrival - 1; z++) {
+                                    if (pDestSlots[z].rep_offset == nRepOffset)
+                                       break;
+                                 }
+
                                 memmove(&pDestSlots[n + 1],
                                    &pDestSlots[n],
-                                    sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
+                                    sizeof(lzsa_arrival) * (z - n));
                              }

                              pDestArrival->cost = nRepCodingChoiceCost;
@ -430,19 +536,24 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
                     }
                  }
               }
+               else {
+                  break;
+               }
            }
         }
+
+         if (nMatchLen >= LCP_MAX && ((m + 1) >= NMATCHES_PER_INDEX_V2 || match[m + 1].length < LCP_MAX))
+            break;
      }
   }

-   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_OFFSET_SHIFT) + 0];
-   pBestMatch[i].length = 0;
-   pBestMatch[i].offset = 0;
+   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];

   while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
+      if (end_arrival->from_pos >= nEndOffset) return;
      pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
      pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
-      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_OFFSET_SHIFT) + (end_arrival->from_slot - 1)];
+      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
   }
 }

@ -470,6 +581,28 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
   for (i = nStartOffset; i < nEndOffset; ) {
      lzsa_match *pMatch = pBestMatch + i;

+      if (pMatch->length == 0 &&
+         (i + 1) < (nEndOffset - LAST_LITERALS) &&
+         pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V2 &&
+         pBestMatch[i + 1].length < MAX_VARLEN &&
+         pBestMatch[i + 1].offset &&
+         i >= pBestMatch[i + 1].offset &&
+         (i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
+         !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
+         int nCurLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V2);
+         int nReducedLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V2);
+
+         if ((nReducedLenSize - nCurLenSize) <= 8) {
+            /* Merge */
+            pBestMatch[i].length = pBestMatch[i + 1].length + 1;
+            pBestMatch[i].offset = pBestMatch[i + 1].offset;
+            pBestMatch[i + 1].length = 0;
+            pBestMatch[i + 1].offset = 0;
+            nDidReduce = 1;
+            continue;
+         }
+      }
+
      if (pMatch->length >= MIN_MATCH_SIZE_V2) {
         if ((i + pMatch->length) < nEndOffset /* Don't consider the last match in the block, we can only reduce a match inbetween other tokens */) {
            int nNextIndex = i + pMatch->length;
@ -583,18 +716,51 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
            }
         }

-         if ((i + pMatch->length) < nEndOffset && pMatch->length >= LCP_MAX &&
-            pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + pMatch->length].offset == pMatch->offset && (pMatch->length % pMatch->offset) == 0 &&
-            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN) {
-            int nMatchLen = pMatch->length;
+         if ((i + pMatch->length) <= nEndOffset && pMatch->offset > 0 && pMatch->length >= MIN_MATCH_SIZE_V2 &&
+            pBestMatch[i + pMatch->length].offset > 0 &&
+            pBestMatch[i + pMatch->length].length >= MIN_MATCH_SIZE_V2 &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
+            (i + pMatch->length) > pMatch->offset &&
+            (i + pMatch->length) > pBestMatch[i + pMatch->length].offset &&
+            (i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
+            !memcmp(pInWindow + i - pMatch->offset + pMatch->length,
+               pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
+               pBestMatch[i + pMatch->length].length)) {

-            /* Join */
+            int nNextIndex = i + pMatch->length;
+            int nNextLiterals = 0;

-            pMatch->length += pBestMatch[i + nMatchLen].length;
-            pBestMatch[i + nMatchLen].offset = 0;
-            pBestMatch[i + nMatchLen].length = -1;
-            nDidReduce = 1;
-            continue;
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < MIN_MATCH_SIZE_V2) {
+               nNextLiterals++;
+               nNextIndex++;
+            }
+
+            int nCurPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length - MIN_MATCH_SIZE_V2);
+
+            nCurPartialSize += 8 /* token */ + lzsa_get_literals_varlen_size_v2(0) + lzsa_get_match_varlen_size_v2(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
+            if (pBestMatch[i + pMatch->length].offset != pMatch->offset)
+               nCurPartialSize += (pBestMatch[i + pMatch->length].offset <= 32) ? 4 : ((pBestMatch[i + pMatch->length].offset <= 512) ? 8 : ((pBestMatch[i + pMatch->length].offset <= (8192 + 512)) ? 12 : 16));
+
+            if (pBestMatch[nNextIndex].offset != pBestMatch[i + pMatch->length].offset)
+               nCurPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+
+            int nReducedPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
+
+            if (pBestMatch[nNextIndex].offset != pMatch->offset)
+               nReducedPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+
+            if (nCurPartialSize >= nReducedPartialSize) {
+               int nMatchLen = pMatch->length;
+
+               /* Join */
+
+               pMatch->length += pBestMatch[i + nMatchLen].length;
+               pBestMatch[i + nMatchLen].offset = 0;
+               pBestMatch[i + nMatchLen].length = -1;
+               nDidReduce = 1;
+               continue;
+            }
         }

         nPrevRepMatchOffset = nRepMatchOffset;
@ -971,37 +1137,40 @@ static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, co
 */
 int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
   int nResult, nBaseCompressedSize;
+   int nMatchesPerArrival = (nInDataSize < 65536) ? NMATCHES_PER_ARRIVAL_V2_BIG : NMATCHES_PER_ARRIVAL_V2_SMALL;

   /* Compress optimally without breaking ties in favor of less tokens */
   
-   lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */);
+   memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+   lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */, nMatchesPerArrival);

   int nDidReduce;
   int nPasses = 0;
   do {
-      nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
      nPasses++;
   } while (nDidReduce && nPasses < 20);

-   nBaseCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
-   lzsa_match *pBestMatch = pCompressor->best_match;
+   nBaseCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+   lzsa_match *pBestMatch = pCompressor->best_match - nPreviousBlockSize;

   if (nBaseCompressedSize > 0 && nInDataSize < 65536) {
      int nReducedCompressedSize;

      /* Compress optimally and do break ties in favor of less tokens */
-      lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */);
+      memset(pCompressor->improved_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+      lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */, nMatchesPerArrival);

      nPasses = 0;
      do {
-         nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+         nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
         nPasses++;
      } while (nDidReduce && nPasses < 20);

-      nReducedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      nReducedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
      if (nReducedCompressedSize > 0 && nReducedCompressedSize <= nBaseCompressedSize) {
         /* Pick the parse with the reduced number of tokens as it didn't negatively affect the size */
-         pBestMatch = pCompressor->improved_match;
+         pBestMatch = pCompressor->improved_match - nPreviousBlockSize;
      }
   }

--- a/src/shrink_context.c
+++ b/src/shrink_context.c
@ -89,19 +89,19 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
            pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));

            if (pCompressor->open_intervals) {
-               pCompressor->arrival = (lzsa_arrival *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_arrival));
-
+               pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << MATCHES_PER_ARRIVAL_SHIFT) * sizeof(lzsa_arrival));
+   
               if (pCompressor->arrival) {
-                  pCompressor->best_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
+                  pCompressor->best_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));

                  if (pCompressor->best_match) {
-                     pCompressor->improved_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
+                     pCompressor->improved_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));

                     if (pCompressor->improved_match) {
                        if (pCompressor->format_version == 2)
-                           pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * 32 * sizeof(lzsa_match));
+                           pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V2 * sizeof(lzsa_match));
                        else
-                           pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * 8 * sizeof(lzsa_match));
+                           pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V1 * sizeof(lzsa_match));
                        if (pCompressor->match)
                           return 0;
                     }
@ -185,7 +185,7 @@ int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, unsigned char *pI
      if (nPreviousBlockSize) {
         lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
      }
-      lzsa_find_all_matches(pCompressor, (pCompressor->format_version == 2) ? 32 : 8, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      lzsa_find_all_matches(pCompressor, (pCompressor->format_version == 2) ? NMATCHES_PER_INDEX_V2 : NMATCHES_PER_INDEX_V1, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);

      if (pCompressor->format_version == 1) {
         nCompressedSize = lzsa_optimize_and_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
--- a/src/shrink_context.h
+++ b/src/shrink_context.h
@ -40,8 +40,8 @@ extern "C" {
 #endif

 #define LCP_BITS 14
-#define TAG_BITS 3
-#define LCP_MAX (1U<<(LCP_BITS - TAG_BITS - 1))
+#define TAG_BITS 4
+#define LCP_MAX ((1U<<(LCP_BITS - TAG_BITS)) - 1)
 #define LCP_AND_TAG_MAX (1U<<(LCP_BITS - 1))
 #define LCP_SHIFT (31-LCP_BITS)
 #define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
@ -49,13 +49,21 @@ extern "C" {
 #define VISITED_FLAG 0x80000000
 #define EXCL_VISITED_MASK  0x7fffffff

-#define NMATCHES_PER_OFFSET 8
-#define MATCHES_PER_OFFSET_SHIFT 3
+#define NMATCHES_PER_ARRIVAL_V1 8
+#define NMATCHES_PER_ARRIVAL_V2_SMALL 9
+#define NMATCHES_PER_ARRIVAL_V2_BIG 32
+#define MATCHES_PER_ARRIVAL_SHIFT 5

-#define LEAVE_ALONE_MATCH_SIZE 1000
+#define NMATCHES_PER_INDEX_V1 8
+#define MATCHES_PER_INDEX_SHIFT_V1 3

-#define LAST_MATCH_OFFSET 4
-#define LAST_LITERALS 1
+#define NMATCHES_PER_INDEX_V2 64
+#define MATCHES_PER_INDEX_SHIFT_V2 6
+
+#define LEAVE_ALONE_MATCH_SIZE 300
+#define LEAVE_ALONE_MATCH_SIZE_SMALL 1000
+
+#define LAST_LITERALS 0

 #define MODESWITCH_PENALTY 3

@ -68,10 +76,10 @@ typedef struct _lzsa_match {
 /** Forward arrival slot */
 typedef struct {
   int cost;
-   int from_pos;
+   unsigned short rep_offset;
   short from_slot;

-   unsigned short rep_offset;
+   int from_pos;
   unsigned short rep_len;
   int rep_pos;
   int num_literals;
--- a/src/shrink_inmem.c
+++ b/src/shrink_inmem.c
@ -142,7 +142,7 @@ size_t lzsa_compress_inmem(unsigned char *pInputData, unsigned char *pOutBuffer,
            if (nBlockheaderSize < 0)
               nError = LZSA_ERROR_COMPRESSION;
            else {
-               if (nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
+               if ((size_t)nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
                  nError = LZSA_ERROR_DST;
               else {
                  memcpy(pOutBuffer + nBlockheaderSize + nCompressedSize, pInputData + nOriginalSize, nInDataSize);