diff --git a/BlockFormat_LZSA2.md b/BlockFormat_LZSA2.md
index 15ec378..f46909a 100644
--- a/BlockFormat_LZSA2.md
+++ b/BlockFormat_LZSA2.md
@@ -44,7 +44,7 @@ The match offset is decoded according to the XYZ bits in the token
     XYZ
     00Z 5-bit offset: read a nibble for offset bits 1-4 and use the inverted bit Z of the token as bit 0 of the offset. set bits 5-15 of the offset to 1.
     01Z 9-bit offset: read a byte for offset bits 0-7 and use the inverted bit Z for bit 8 of the offset. set bits 9-15 of the offset to 1.
-    10Z 13-bit offset: read a nibble for offset bits 9-12 and use the inverted bit Z for bit 8 of the offset, then read a byte for offset bits 0-7. set bits 13-15 of the offset to 1.
+    10Z 13-bit offset: read a nibble for offset bits 9-12 and use the inverted bit Z for bit 8 of the offset, then read a byte for offset bits 0-7. set bits 13-15 of the offset to 1. subtract 512 from the offset to get the final value.
     110 16-bit offset: read a byte for offset bits 8-15, then another byte for offset bits 0-7.
     111 repeat offset: reuse the offset value of the previous match command.
 
@@ -58,7 +58,7 @@ Note that the match offset is negative: it is added to the current decompressed
 
 If the encoded match length is 7 or more, the 'M' bits in the token form the value 7, and an extra nibble is read:
 
-* 0-14: the value is added to the 3 stored in the token, and then the minmatch of 2 is added, to compose the final match length.
+* 0-14: the value is added to the 7 stored in the token, and then the minmatch of 2 is added, to compose the final match length.
 * 15: an extra byte follows
 
 If an extra byte follows here, it can have two possible types of value:
diff --git a/Makefile b/Makefile
index 9e98565..68614f0 100755
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,7 @@
 CC=clang
-CFLAGS=-O3 -fomit-frame-pointer -Isrc/libdivsufsort/include -Isrc
+CFLAGS=-O3 -g -fomit-frame-pointer -Isrc/libdivsufsort/include -Isrc
 OBJDIR=obj
 LDFLAGS=
-STRIP=strip
 
 $(OBJDIR)/%.o: src/../%.c
 	@mkdir -p '$(@D)'
@@ -33,9 +32,7 @@ OBJS += $(OBJDIR)/src/libdivsufsort/lib/trsort.o
 all: $(APP)
 
 $(APP): $(OBJS)
-	@mkdir -p ../../bin/posix
 	$(CC) $^ $(LDFLAGS) -o $(APP)
-	$(STRIP) $(APP)
 
 clean:
 	@rm -rf $(APP) $(OBJDIR)
diff --git a/README.md b/README.md
index 6430ee5..4646c8f 100755
--- a/README.md
+++ b/README.md
@@ -7,6 +7,16 @@ Check out [The Hollow](https://www.pouet.net/prod.php?which=81909) by Darklite a
 
 [Gabba](https://www.pouet.net/prod.php?which=83539) by Stardust ranked 2nd in the ZX Spectrum demo compo at CAFe demoparty 2019 and also used LZSA on Z80. 
 
+[Myst Demake](http://www.deater.net/weave/vmwprod/mist/) for the Apple II by Vince Weaver uses LZSA on 6502.
+
+The 8-Bit Guy's [Commander X16 ROM](https://github.com/commanderx16/x16-rom) uses LZSA on 6502 as well.
+
+[RomWBW](https://github.com/wwarthen/RomWBW) uses LZSA on Z80 for a variety of hobbyist computers.
+
+The popular [rasm](https://github.com/EdouardBERGE/rasm) assembler for Z80 features LZSA-compressed data sections.
+
+The [desolate](https://github.com/nzeemin/spectrum-desolate) game port to the ZX Spectrum uses LZSA compression on Z80.
+
 The LZSA compression tool uses an aggressive optimal packing strategy to try to find the sequence of commands that gives the smallest packed file that decompresses to the original while maintaining the maximum possible decompression speed.
 
 The compression formats give the user choices that range from decompressing faster than LZ4 on 8-bit systems with better compression, to compressing as well as ZX7 with much better decompression speed. LZSA1 is designed to replace LZ4 and LZSA2 to replace ZX7, in 8-bit scenarios.
@@ -55,7 +65,7 @@ Inspirations:
 * [LZ5/Lizard](https://github.com/inikep/lizard) by Przemyslaw Skibinski and Yann Collet.
 * The suffix array intervals in [Wimlib](https://wimlib.net/git/?p=wimlib;a=tree) by Eric Biggers.
 * ZX7 by Einar Saukas
-* [apc](https://github.com/svendahl/cap) by Sven-Ĺke Dahl
+* [apc](https://github.com/svendahl/cap) by Sven-Åke Dahl
 * [Charles Bloom](http://cbloomrants.blogspot.com/)'s compression blog
 
 License:
@@ -65,14 +75,19 @@ License:
 
 8-bit assembly code:
 
-* Z80 decompressors (size- and speed-optimized) written by [introspec](https://github.com/specke)
+* Z80 decompressors (size- and speed-optimized) written by [introspec](https://github.com/specke) with optimizations by [uniabis](https://github.com/uniabis)
 * 6502 and 8088 size-optimized improvements by [Peter Ferrie](https://github.com/peterferrie)
+* 6502 speed-optimized decompressor by [John Brandwood](https://github.com/jbrandwood)
 * 8088 speed-optimized decompressor by [Jim Leonard](https://github.com/mobygamer)
+* 6809 decompressors (Tandy CoCo, Thomson MO/TO, Dragon 32/64, ...) optimized by [Doug Masten](https://github.com/dougmasten)
+* Hitachi 6309 decompressors (Tandy CoCo 3) also contributed by [Doug Masten](https://github.com/dougmasten)
 
 External links:
 
 * [i8080 decompressors](https://gitlab.com/ivagor/lzsa8080/tree/master) by Ivan Gorodetsky
 * [PDP-11 decompressors](https://gitlab.com/ivagor/lzsa8080/tree/master/PDP11) also by Ivan Gorodetsky
+* [MC68000 decompressors](https://github.com/tattlemuss/lz4-m68k/blob/master/src/lzsa.s) by Steven Tattersall
+* [Gameboy decompressors](https://github.com/meltycode) by Meltycode, based on the Z80 code by introspec
 * LZSA's page on [Pouet](https://www.pouet.net/prod.php?which=81573)
 
 # Compressed format
diff --git a/StreamFormat.md b/StreamFormat.md
index 3f37f86..8eebb7e 100644
--- a/StreamFormat.md
+++ b/StreamFormat.md
@@ -17,7 +17,7 @@ The 3-bytes LZSA header contains a signature and a traits byte:
 
 Trait bits:
 
-* V: 3 bit code that indicates which block data encoding is used. 0 is LZSA1 and 2 is LZSA2.
+* V: 3 bit code that indicates which block data encoding is used. 0 is LZSA1 and 1 is LZSA2.
 * Z: these bits in the traits are set to 0 for LZSA1 and LZSA2.
 
 # Frame format
diff --git a/asm/6502/decompress_fast_v1.asm b/asm/6502/decompress_fast_v1.asm
index b36cc17..7aa651d 100644
--- a/asm/6502/decompress_fast_v1.asm
+++ b/asm/6502/decompress_fast_v1.asm
@@ -68,7 +68,7 @@ LARGE_VARLEN_LITERALS                   ; handle 16 bits literals count
    JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
    TAY                                  ; put high 8 bits in Y
    TXA
-   JMP PREPARE_COPY_LARGE_LITERALS
+   BCS PREPARE_COPY_LARGE_LITERALS      ; (*like JMP PREPARE_COPY_LARGE_LITERALS but shorter)
 
 PREPARE_COPY_LITERALS
    TAX
diff --git a/asm/6502/decompress_fast_v2.asm b/asm/6502/decompress_fast_v2.asm
index 681d42d..1e49a75 100644
--- a/asm/6502/decompress_fast_v2.asm
+++ b/asm/6502/decompress_fast_v2.asm
@@ -114,11 +114,9 @@ NO_LITERALS
    BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
    
 OFFSET_9_BIT                            ; 01Z: 9 bit offset
-   ;;ASL                                  ; shift Z (offset bit 8) in place
-   ROL
-   ROL
-   AND #$01
-   EOR #$FF                             ; set offset bits 15-9 to 1
+   ROL                                  ; carry: Z bit; A: xxxxxxx1 (carry known set from BCS OFFSET_9_BIT)
+   ADC #$00                             ; if Z bit is set, add 1 to A (bit 0 of A is now 0), otherwise bit 0 is 1
+   ORA #$FE                             ; set offset bits 15-9 to 1. reversed Z is already in bit 0
    BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
                                         ; (*same as JMP GOT_OFFSET_HI but shorter)
 
@@ -134,7 +132,6 @@ REPMATCH_OR_LARGE_OFFSET
                                         ; (*same as JMP GOT_OFFSET_HI but shorter)
 
 REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
-   ;;ASL                                  ; XYZ=111?
    BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
    
                                         ; 110: handle 16 bit offset
@@ -259,7 +256,6 @@ GETCOMBINEDBITS
 
    JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
    PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
-COMBINEDBITZ
    ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
 DECOMPRESSION_DONE
    RTS
diff --git a/asm/6502/decompress_faster_v1.asm b/asm/6502/decompress_faster_v1.asm
new file mode 100644
index 0000000..1f65a40
--- /dev/null
+++ b/asm/6502/decompress_faster_v1.asm
@@ -0,0 +1,353 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; lzsa1_6502.s
+;
+; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA1 format.
+;
+; This code is written for the ACME assembler.
+;
+; Optional code is presented for one minor 6502 optimization that breaks
+; compatibility with the current LZSA1 format standard.
+;
+; The code is 168 bytes for the small version, and 205 bytes for the normal.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+;  http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Options & Macros
+;
+
+                ;
+                ; Choose size over speed (within sane limits)?
+                ;
+
+LZSA_SMALL_SIZE =       0
+
+                ;
+                ; Remove code inlining to save space?
+                ;
+                ; This saves 15 bytes of code at the cost of 7% speed.
+                ;
+
+                !if     LZSA_SMALL_SIZE {
+LZSA_NO_INLINE  =       1
+                } else {
+LZSA_NO_INLINE  =       0
+                }
+
+                ;
+                ; Use smaller code for copying literals?
+                ;
+                ; This saves 11 bytes of code at the cost of 15% speed.
+                ;
+
+                !if     LZSA_SMALL_SIZE {
+LZSA_SHORT_CP   =       1
+                } else {
+LZSA_SHORT_CP   =       0
+                }
+
+                ;
+                ; Use smaller code for copying matches?
+                ;
+                ; This saves 11 bytes of code at the cost of 30% speed.
+                ;
+
+                !if     LZSA_SMALL_SIZE {
+LZSA_SHORT_LZ   =       1
+                } else {
+LZSA_SHORT_LZ   =       0
+                }
+
+                ;
+                ; Macro to increment the source pointer to the next page.
+                ;
+                ; This should call a subroutine to determine if a bank
+                ; has been crossed, and a new bank should be paged in.
+                ;
+
+                !macro  LZSA_INC_PAGE {
+                        inc     <lzsa_srcptr + 1
+                }
+
+                ;
+                ; Macro to read a byte from the compressed source data.
+                ;
+
+                !if     LZSA_NO_INLINE {
+
+                        !macro LZSA_GET_SRC {
+                        jsr     lzsa1_get_byte
+                        }
+
+                } else {
+
+                        !macro LZSA_GET_SRC {
+                        lda     (lzsa_srcptr),y
+                        inc     <lzsa_srcptr + 0
+                        bne     .skip
+                        +LZSA_INC_PAGE
+.skip:
+                        }
+
+                }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 8 bytes of zero-page.
+;
+
+                !if     (LZSA_SHORT_CP | LZSA_SHORT_LZ) {
+lzsa_length     =       $F8                     ; 1 byte.
+                }
+
+lzsa_cmdbuf     =       $F9                     ; 1 byte.
+lzsa_winptr     =       $FA                     ; 1 word.
+lzsa_srcptr     =       $FC                     ; 1 word.
+lzsa_dstptr     =       $FE                     ; 1 word.
+
+LZSA_SRC_LO     =       $FC
+LZSA_SRC_HI     =       $FD
+LZSA_DST_LO     =       $FE
+LZSA_DST_HI     =       $FF
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; lzsa1_unpack - Decompress data stored in Emmanuel Marty's LZSA1 format.
+;
+; Args: lzsa_srcptr = ptr to compressed data
+; Args: lzsa_dstptr = ptr to output buffer
+; Uses: lots!
+;
+
+DECOMPRESS_LZSA1_FAST:
+lzsa1_unpack:   ldy     #0                      ; Initialize source index.
+                ldx     #0                      ; Initialize hi-byte of length.
+
+                ;
+                ; Copy bytes from compressed source data.
+                ; Bits 4-6 of the command byte hold the literal length (7 = extended).
+                ; N.B. X=0 is expected and guaranteed when we get here.
+                ;
+
+.cp_length:     +LZSA_GET_SRC
+                sta     <lzsa_cmdbuf            ; Preserve this for later.
+                and     #$70                    ; Extract literal length.
+                beq     .lz_offset              ; Skip directly to match?
+
+                lsr                             ; Get 3-bit literal length.
+                lsr
+                lsr
+                lsr
+                cmp     #$07                    ; Extended length?
+                bne     .got_cp_len             ; No, A is the final length.
+
+                jsr     .get_length             ; CS from CMP, X=0.
+
+                !if     LZSA_SHORT_CP {
+
+.got_cp_len:    cmp     #0                      ; Check the lo-byte of length.
+                beq     .put_cp_len             ; Zero: no partial page to add.
+
+                inx                             ; Increment # of pages to copy.
+
+.put_cp_len:    stx     <lzsa_length            ; Outer-loop (page) counter.
+                tax                             ; Inner-loop counter = lo-byte.
+
+.cp_page:       lda     (lzsa_srcptr),y         ; Copy one literal byte.
+                sta     (lzsa_dstptr),y
+                inc     <lzsa_srcptr + 0        ; Advance source pointer.
+                bne     .skip1
+                inc     <lzsa_srcptr + 1
+.skip1:         inc     <lzsa_dstptr + 0        ; Advance destination pointer.
+                bne     .skip2
+                inc     <lzsa_dstptr + 1
+.skip2:         dex
+                bne     .cp_page
+                dec     <lzsa_length            ; Any full pages left to copy?
+                bne     .cp_page
+
+                } else {
+
+.got_cp_len:    tay                             ; Check the lo-byte of length.
+                beq     .cp_page                ; Zero: copy whole pages only.
+
+                inx                             ; Increment # of pages to copy.
+
+.get_cp_src:    clc                             ; Calc address of partial page.
+                adc     <lzsa_srcptr + 0        ; Add lo-byte of length to ptr.
+                sta     <lzsa_srcptr + 0
+                bcs     .get_cp_dst             ; Carry: hi-byte is already right.
+                dec     <lzsa_srcptr + 1        ; No carry: back up one page.
+
+.get_cp_dst:    tya                             ; Reload lo-byte of length.
+                clc                             ; Calc address of partial page.
+                adc     <lzsa_dstptr + 0        ; Add lo-byte of length to ptr.
+                sta     <lzsa_dstptr + 0
+                bcs     .get_cp_idx             ; Carry: hi-byte is already right.
+                dec     <lzsa_dstptr + 1        ; No carry: back up one page.
+
+.get_cp_idx:    tya                             ; Negate the lo-byte of length.
+                eor     #$FF
+                tay
+                iny                             ; Y = 256 - lo-byte of length.
+
+.cp_page:       lda     (lzsa_srcptr),y         ; Copy one literal byte.
+                sta     (lzsa_dstptr),y
+                iny
+                bne     .cp_page                ; Loop until Y wraps to 0.
+                inc     <lzsa_srcptr + 1        ; Step both ptrs to next page.
+                inc     <lzsa_dstptr + 1
+                dex                             ; Any full pages left to copy?
+                bne     .cp_page
+
+                }
+
+                ;
+                ; Copy bytes from decompressed window.
+                ; Bit 7 of the command byte selects a 2-byte offset; bits 0-3 hold the length.
+                ; N.B. X=0 is expected and guaranteed when we get here.
+                ;
+
+.lz_offset:     +LZSA_GET_SRC
+                clc
+                adc     <lzsa_dstptr + 0        ; winptr = dstptr + offset (lo).
+                sta     <lzsa_winptr + 0
+
+                lda     #$FF                    ; Default hi-byte of the offset.
+                bit     <lzsa_cmdbuf            ; N flag = bit 7 of command byte.
+                bpl     .hi_offset              ; Bit 7 clear: hi-byte stays $FF.
+                +LZSA_GET_SRC
+
+.hi_offset:     adc     <lzsa_dstptr + 1        ; Uses carry from lo-byte add.
+                sta     <lzsa_winptr + 1
+
+.lz_length:     lda     <lzsa_cmdbuf            ; X=0 from previous loop.
+                and     #$0F
+                adc     #$03 - 1                ; CS from previous ADC.
+                cmp     #$12                    ; Extended length?
+                bne     .got_lz_len             ; No, A is the final length.
+
+                jsr     .get_length             ; CS from CMP, X=0.
+
+                !if     LZSA_SHORT_LZ {
+
+.got_lz_len:    cmp     #0                      ; Check the lo-byte of length.
+                beq     .put_lz_len             ; Zero: no partial page to add.
+
+                inx                             ; Increment # of pages to copy.
+
+.put_lz_len:    stx     <lzsa_length            ; Outer-loop (page) counter.
+                tax                             ; Inner-loop counter = lo-byte.
+
+.lz_page:       lda     (lzsa_winptr),y         ; Copy one byte from the window.
+                sta     (lzsa_dstptr),y
+                inc     <lzsa_winptr + 0        ; Advance window pointer.
+                bne     .skip3
+                inc     <lzsa_winptr + 1
+.skip3:         inc     <lzsa_dstptr + 0        ; Advance destination pointer.
+                bne     .skip4
+                inc     <lzsa_dstptr + 1
+.skip4:         dex
+                bne     .lz_page
+                dec     <lzsa_length            ; Any full pages left to copy?
+                bne     .lz_page
+
+                jmp     .cp_length              ; Loop around to the beginning.
+
+                } else {
+
+.got_lz_len:    tay                             ; Check the lo-byte of length.
+                beq     .lz_page                ; Zero: copy whole pages only.
+
+                inx                             ; Increment # of pages to copy.
+
+.get_lz_win:    clc                             ; Calc address of partial page.
+                adc     <lzsa_winptr + 0        ; Add lo-byte of length to ptr.
+                sta     <lzsa_winptr + 0
+                bcs     .get_lz_dst             ; Carry: hi-byte is already right.
+                dec     <lzsa_winptr + 1        ; No carry: back up one page.
+
+.get_lz_dst:    tya                             ; Reload lo-byte of length.
+                clc                             ; Calc address of partial page.
+                adc     <lzsa_dstptr + 0        ; Add lo-byte of length to ptr.
+                sta     <lzsa_dstptr + 0
+                bcs     .get_lz_idx             ; Carry: hi-byte is already right.
+                dec     <lzsa_dstptr + 1        ; No carry: back up one page.
+
+.get_lz_idx:    tya                             ; Negate the lo-byte of length.
+                eor     #$FF
+                tay
+                iny                             ; Y = 256 - lo-byte of length.
+
+.lz_page:       lda     (lzsa_winptr),y         ; Copy one byte from the window.
+                sta     (lzsa_dstptr),y
+                iny
+                bne     .lz_page                ; Loop until Y wraps to 0.
+                inc     <lzsa_winptr + 1        ; Step both ptrs to next page.
+                inc     <lzsa_dstptr + 1
+                dex                             ; Any full pages left to copy?
+                bne     .lz_page
+
+                jmp     .cp_length              ; Loop around to the beginning.
+
+                }
+
+                ;
+                ; Get 16-bit length in X:A register pair.
+                ; On entry A = length so far; the next source byte is added to it.
+                ; N.B. X=0 is expected and guaranteed when we get here.
+                ;
+
+.get_length:    clc                             ; Add on the next byte to get
+                adc     (lzsa_srcptr),y         ; the length.
+                inc     <lzsa_srcptr + 0        ; Consume that source byte.
+                bne     .skip_inc
+                +LZSA_INC_PAGE
+
+.skip_inc:      bcc     .got_length             ; No overflow means done.
+                cmp     #$00                    ; Overflow to 256 or 257?
+                beq     .extra_word             ; A=0 (256): 16-bit length next.
+
+.extra_byte:    inx                             ; X=1: length = 256 + next byte.
+                jmp     lzsa1_get_byte          ; So rare, this can be slow!
+
+.extra_word:    jsr     lzsa1_get_byte          ; So rare, this can be slow!
+                pha                             ; Save length-lo on the stack.
+                jsr     lzsa1_get_byte          ; So rare, this can be slow!
+                tax                             ; Length-hi into X.
+                beq     .finished               ; Length-hi == 0 at EOF.
+                pla                             ; Length-lo.
+                rts                             ; Return with X:A = length.
+
+lzsa1_get_byte:
+                lda     (lzsa_srcptr),y         ; Subroutine version for when
+                inc     <lzsa_srcptr + 0        ; inlining isn't advantageous.
+                beq     lzsa1_next_page         ; Lo-byte wrapped: bump the page.
+.got_length:    rts
+
+lzsa1_next_page:
+                inc     <lzsa_srcptr + 1        ; Inc & test for bank overflow.
+                rts
+
+.finished:      pla                             ; Length-lo.
+                pla                             ; Decompression completed, pop
+                pla                             ; return address.
+                rts                             ; Return to the unpacker's caller.
diff --git a/asm/6502/decompress_faster_v2.asm b/asm/6502/decompress_faster_v2.asm
index c8768c8..1d9b90a 100644
--- a/asm/6502/decompress_faster_v2.asm
+++ b/asm/6502/decompress_faster_v2.asm
@@ -5,10 +5,12 @@
 ;
 ; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
 ;
+; This code is written for the ACME assembler.
+;
 ; Optional code is presented for two minor 6502 optimizations that break
 ; compatibility with the current LZSA2 format standard.
 ;
-; This code is written for the ACME assembler.
+; The code is 241 bytes for the small version, and 267 bytes for the normal.
 ;
 ; Copyright John Brandwood 2019.
 ;
@@ -28,100 +30,96 @@
 ;
 
                 ;
-                ; Save 7 bytes of code, and 21 cycles every time that a 
-                ; 16-bit length is decoded?
-                ;
-                ; N.B. Setting this breaks compatibility with LZSA v1.2
+                ; Choose size over speed (within sane limits)?
                 ;
 
-LZSA_SWAP_LEN16 =       0
-
-                ;
-                ; Save 3 bytes of code, and 4 or 8 cycles when decoding
-                ; an offset?
-                ;
-                ; N.B. Setting this breaks compatibility with LZSA v1.2
-                ;
-
-LZSA_SWAP_XZY   =       0
+LZSA_SMALL_SIZE =       0
 
                 ;
                 ; Remove code inlining to save space?
                 ;
-                ; This saves 15 bytes of code, but decompression is 7% slower.
+                ; This saves 15 bytes of code at the cost of 7% speed.
                 ;
 
-LZSA_BEST_SIZE  =       0
+                !if      LZSA_SMALL_SIZE {
+LZSA_NO_INLINE  =       1
+                } else {
+LZSA_NO_INLINE  =       0
+                }
 
                 ;
-                ; Assume that we're decompessing from a large multi-bank
-                ; compressed data file, and that the next bank may need to
-                ; paged in when a page-boundary is crossed.
+                ; Use smaller code for copying literals?
+                ;
+                ; This saves 11 bytes of code at the cost of 5% speed.
                 ;
 
-LZSA_FROM_BANK  =       0
+                !if      LZSA_SMALL_SIZE {
+LZSA_SHORT_CP   =       1
+                } else {
+LZSA_SHORT_CP   =       0
+                }
+
+                ;
+                ; We will read from or write to $FFFF.  This prevents the
+                ; use of the "INC ptrhi / BNE" trick and reduces speed.
+                ;
+
+LZSA_USE_FFFF  =        0
 
                 ;
                 ; Macro to increment the source pointer to the next page.
                 ;
 
-                !if     LZSA_FROM_BANK {
-
-                   !macro LZSA_INC_PAGE {
-                      jsr     .next_page
-                   }
-
-                } else {
-
-                   !macro LZSA_INC_PAGE {
-                      inc     <lzsa_srcptr + 1
-                   }
-
+                !macro LZSA_INC_PAGE {
+                        inc     <lzsa_srcptr + 1
                 }
 
                 ;
                 ; Macro to read a byte from the compressed source data.
                 ;
 
-                !if     LZSA_BEST_SIZE {
+                !if     LZSA_NO_INLINE {
 
-                   !macro LZSA_GET_SRC {
-                      jsr     .get_byte
-                   }
+                        !macro  LZSA_GET_SRC {
+                        jsr     lzsa2_get_byte
+                        }
 
                 } else {
 
-                   !macro LZSA_GET_SRC {
-                      lda     (lzsa_srcptr),y
-                      inc     <lzsa_srcptr + 0
-                      bne     .skip
-                      +LZSA_INC_PAGE
+                        !macro  LZSA_GET_SRC {
+                        lda     (lzsa_srcptr),y
+                        inc     <lzsa_srcptr + 0
+                        bne     .skip
+                        +LZSA_INC_PAGE
 .skip:
-                   }
+                        }
 
                 }
 
                 ;
                 ; Macro to speed up reading 50% of nibbles.
                 ;
+                ; This seems to save very few cycles compared to the
+                ; increase in code size, and it isn't recommended.
+                ;
 
 LZSA_SLOW_NIBL  =       1
 
-                !if     (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
+                !if     (LZSA_SLOW_NIBL + LZSA_SMALL_SIZE) {
 
-                   !macro LZSA_GET_NIBL {
-                      jsr     lzsa2_get_nibble        ; Always call a function.
-                   }
+                        !macro  LZSA_GET_NIBL {
+                        jsr     lzsa2_get_nibble        ; Always call a function.
+                        }
 
                 } else {
 
-                   !macro LZSA_GET_NIBL {
-                      lsr     <lzsa_nibflg            ; Is there a nibble waiting?
-                      lda     <lzsa_nibble            ; Extract the lo-nibble.
-                      bcs     .skip\@
-                      jsr     .new_nibble             ; Extract the hi-nibble.
-      .skip\@:        ora     #$F0
-                   }
+                        !macro  LZSA_GET_NIBL {
+                        lsr     <lzsa_nibflg            ; Is there a nibble waiting?
+                        lda     <lzsa_nibble            ; Extract the lo-nibble.
+                        bcs     .skip
+                        jsr     lzsa2_new_nibble        ; Extract the hi-nibble.
+.skip:                  ora     #$F0
+                        }
 
                 }
 
@@ -141,35 +139,68 @@ lzsa_winptr     =       $FA                     ; 1 word.
 lzsa_srcptr     =       $FC                     ; 1 word.
 lzsa_dstptr     =       $FE                     ; 1 word.
 
+lzsa_length     =       lzsa_winptr             ; 1 word.
+
 LZSA_SRC_LO     =       $FC
 LZSA_SRC_HI     =       $FD
 LZSA_DST_LO     =       $FE
 LZSA_DST_HI     =       $FF
 
+
+
 ; ***************************************************************************
 ; ***************************************************************************
 ;
-; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format.
+; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
 ;
 ; Args: lzsa_srcptr = ptr to compessed data
 ; Args: lzsa_dstptr = ptr to output buffer
 ; Uses: lots!
 ;
-; If compiled with LZSA_FROM_BANK, then lzsa_srcptr should be within the bank
-; window range.
-;
 
 DECOMPRESS_LZSA2_FAST:
 lzsa2_unpack:   ldy     #0                      ; Initialize source index.
                 sty     <lzsa_nibflg            ; Initialize nibble buffer.
 
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
+
+                beq     .cp_length              ; always taken
+.incsrc1:
+                inc     <lzsa_srcptr + 1
+                bne     .resume_src1            ; always taken
+
+                !if     LZSA_SHORT_CP {
+.incsrc2:
+                inc     <lzsa_srcptr + 1
+                bne     .resume_src2            ; always taken
+
+.incdst:
+                inc     <lzsa_dstptr + 1
+                bne     .resume_dst             ; always taken
+
+                }
+
+                }
+
                 ;
                 ; Copy bytes from compressed source data.
                 ;
 
 .cp_length:     ldx     #$00                    ; Hi-byte of length or offset.
 
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) {
+
                 +LZSA_GET_SRC
+
+                } else {
+
+                lda     (lzsa_srcptr),y
+                inc     <lzsa_srcptr + 0
+                beq     .incsrc1
+
+                }
+
+.resume_src1:
                 sta     <lzsa_cmdbuf            ; Preserve this for later.
                 and     #$18                    ; Extract literal length.
                 beq     .lz_offset              ; Skip directly to match?
@@ -182,20 +213,60 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
 
                 jsr     .get_length             ; X=0 table index for literals.
 
+                !if     LZSA_SHORT_CP {
+
+.got_cp_len:    cmp     #0                      ; Check the lo-byte of length.
+                beq     .put_cp_len
+
+                inx                             ; Increment # of pages to copy.
+
+.put_cp_len:    stx     <lzsa_length
+                tax
+
+.cp_page:       lda     (lzsa_srcptr),y
+                sta     (lzsa_dstptr),y
+                inc     <lzsa_srcptr + 0
+
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) {
+
+                bne     .skip1
+                inc     <lzsa_srcptr + 1
+.skip1:         inc     <lzsa_dstptr + 0
+                bne     .skip2
+                inc     <lzsa_dstptr + 1
+.skip2:
+
+                } else {
+
+                beq     .incsrc2
+.resume_src2:
+                inc     <lzsa_dstptr + 0
+                beq     .incdst
+.resume_dst:
+
+                }
+
+                dex
+                bne     .cp_page
+                dec     <lzsa_length            ; Any full pages left to copy?
+                bne     .cp_page
+
+                } else {
+
 .got_cp_len:    tay                             ; Check the lo-byte of length.
                 beq     .cp_page
 
                 inx                             ; Increment # of pages to copy.
 
-.get_cp_src:    clc                             ; Calc source for partial
-                adc     <lzsa_srcptr + 0        ; page.
+.get_cp_src:    clc                             ; Calc address of partial page.
+                adc     <lzsa_srcptr + 0
                 sta     <lzsa_srcptr + 0
                 bcs     .get_cp_dst
                 dec     <lzsa_srcptr + 1
 
 .get_cp_dst:    tya
-                clc                             ; Calc destination for partial
-                adc     <lzsa_dstptr + 0        ; page.
+                clc                             ; Calc address of partial page.
+                adc     <lzsa_dstptr + 0
                 sta     <lzsa_dstptr + 0
                 bcs     .get_cp_idx
                 dec     <lzsa_dstptr + 1
@@ -214,66 +285,15 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
                 dex                             ; Any full pages left to copy?
                 bne     .cp_page
 
-                !if      LZSA_SWAP_XZY {
+                }
 
-                ;
-                ; Shorter and faster path with NEW order of bits.
-                ;
-                ; STD  NEW
-                ; ================================ 
-                ; xyz  xzy
-                ; 00z  0z0  5-bit offset
-                ; 01z  0z1  9-bit offset
-                ; 10z  1z0  13-bit offset
-                ; 110  101  16-bit offset
-                ; 111  111  repeat offset
-                ;      NVZ  for a BIT instruction
-                ;
-                ; N.B. Saves 3 bytes in code length.
-                ;      get5 and get13 are 8 cycles faster.
-                ;      get9, get16, and rep are 4 cycles faster.
-                ;
-
-.lz_offset:     lda     #$20                    ; Y bit in lzsa_cmdbuf.
-                bit     <lzsa_cmdbuf
-                bmi     .get_13_16_rep
-                bne     .get_9_bits
-
-.get_5_bits:    dex                             ; X=$FF
-.get_13_bits:   LZSA_GET_NIBL                   ; Always returns with CS.
-                bvc     .get_5_skip
-                clc
-.get_5_skip:    rol     a                       ; Shift into position, set C.
-                cpx     #$00                    ; X=$FF for a 5-bit offset.
-                bne     .set_offset
-                sbc     #2                      ; Subtract 512 because 13-bit
-                tax                             ; offset starts at $FE00.
-                bne     .get_low8               ; Always NZ from previous TAX.
-
-.get_9_bits:    dex                             ; X=$FF if VC, X=$FE if VS.
-                bvc     .get_low8
-                dex
-                bvs     .get_low8               ; Always VS from previous BIT.
-
-.get_13_16_rep: beq     .get_13_bits            ; Shares code with 5-bit path.
-
-.get_16_rep:    bvs     .lz_length              ; Repeat previous offset.
-
-                } else {
-
-                ;
-                ; Slower and longer path with STD order of bits.
-                ;
-                ; Z80  NES
-                ; ================================ 
-                ; xyz  xzy
-                ; 00z  0z0  5-bit offset
-                ; 01z  0z1  9-bit offset
-                ; 10z  1z0  13-bit offset
-                ; 110  101  16-bit offset
-                ; 111  111  repeat offset
-                ;      NVZ  for a BIT instruction
-                ;
+                ; ================================
+                ; xyz  
+                ; 00z  5-bit offset
+                ; 01z  9-bit offset
+                ; 10z  13-bit offset
+                ; 110  16-bit offset
+                ; 111  repeat offset
 
 .lz_offset:     lda     <lzsa_cmdbuf
                 asl
@@ -291,8 +311,8 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
                 cpx     #$00                    ; X=$FF for a 5-bit offset.
                 bne     .set_offset
                 sbc     #2                      ; Subtract 512 because 13-bit
-                tax                             ; offset starts at $FE00.
-                bne     .get_low8               ; Always NZ from previous TAX.
+                                                ; offset starts at $FE00.
+                bne     .get_low8x              ; Always NZ from previous SBC.
 
 .get_9_bits:    dex                             ; X=$FF if CS, X=$FE if CC.
                 asl
@@ -305,18 +325,29 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
 
 .get_16_rep:    bmi     .lz_length              ; Repeat previous offset.
 
-                }
-
                 ;
                 ; Copy bytes from decompressed window.
                 ;
                 ; N.B. X=0 is expected and guaranteed when we get here.
                 ;
 
-.get_16_bits:   jsr     .get_byte               ; Get hi-byte of offset.
-                tax
+.get_16_bits:   jsr     lzsa2_get_byte          ; Get hi-byte of offset.
 
-.get_low8:      +LZSA_GET_SRC                   ; Get lo-byte of offset.
+.get_low8x:     tax
+
+.get_low8:
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) {
+
+                +LZSA_GET_SRC                   ; Get lo-byte of offset.
+
+                } else {
+
+                lda     (lzsa_srcptr),y
+                inc     <lzsa_srcptr + 0
+                beq     .incsrc3
+.resume_src3:
+
+                }
 
 .set_offset:    stx     <lzsa_offset + 1        ; Save new offset.
                 sta     <lzsa_offset + 0
@@ -366,6 +397,14 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
 
                 jmp     .cp_length              ; Loop around to the beginning.
 
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
+
+.incsrc3:
+                inc     <lzsa_srcptr + 1
+                bne     .resume_src3            ; always taken
+
+                }
+
                 ;
                 ; Lookup tables to differentiate literal and match lengths.
                 ;
@@ -379,8 +418,6 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
                 ;
                 ; Get 16-bit length in X:A register pair.
                 ;
-                ; N.B. Requires reversal of bytes in 16-bit length.
-                ;
 
 .get_length:    +LZSA_GET_NIBL
                 cmp     #$FF                    ; Extended length?
@@ -390,36 +427,26 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
 .got_length:    ldx     #$00                    ; Set hi-byte of 4 & 8 bit
                 rts                             ; lengths.
 
-.byte_length:   jsr     .get_byte               ; So rare, this can be slow!
+.byte_length:   jsr     lzsa2_get_byte          ; So rare, this can be slow!
                 adc     .byte_len_tbl,x         ; Always CS from previous CMP.
                 bcc     .got_length
                 beq     .finished
 
-                !if      LZSA_SWAP_LEN16 {
-
-.word_length:   jsr     .get_byte               ; So rare, this can be slow!
-                tax
-
-                } else {
-
-.word_length:   jsr     .get_byte               ; So rare, this can be slow!
+.word_length:   jsr     lzsa2_get_byte          ; So rare, this can be slow!
                 pha
-                jsr     .get_byte               ; So rare, this can be slow!
+                jsr     lzsa2_get_byte          ; So rare, this can be slow!
                 tax
                 pla
                 rts
 
-                }
-
-.get_byte:      lda     (lzsa_srcptr),y         ; Subroutine version for when
+lzsa2_get_byte: 
+                lda     (lzsa_srcptr),y         ; Subroutine version for when
                 inc     <lzsa_srcptr + 0        ; inlining isn't advantageous.
-                beq     .next_page
+                beq     lzsa2_next_page
                 rts
 
-.next_page:     inc     <lzsa_srcptr + 1        ; Inc & test for bank overflow.
-                !if      LZSA_FROM_BANK {
-                bmi     .next_bank              ; Change for target hardware!
-                }
+lzsa2_next_page:
+                inc     <lzsa_srcptr + 1        ; Advance to the next source page (no bank-overflow test here).
                 rts
 
 .finished:      pla                             ; Decompression completed, pop
@@ -430,41 +457,66 @@ lzsa2_unpack:   ldy     #0                      ; Initialize source index.
                 ; Get a nibble value from compressed data in A.
                 ;
 
-                !if      (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
+                !if     (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
 
-lzsa2_get_nibble:    lsr     <lzsa_nibflg            ; Is there a nibble waiting?
+lzsa2_get_nibble:
+                lsr     <lzsa_nibflg            ; Is there a nibble waiting?
                 lda     <lzsa_nibble            ; Extract the lo-nibble.
                 bcs     .got_nibble
 
                 inc     <lzsa_nibflg            ; Reset the flag.
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) {
+
                 +LZSA_GET_SRC
+
+                } else {
+
+                lda     (lzsa_srcptr),y
+                inc     <lzsa_srcptr + 0
+                beq     .incsrc4
+.resume_src4:
+
+                }
+
                 sta     <lzsa_nibble            ; Preserve for next time.
                 lsr                             ; Extract the hi-nibble.
                 lsr
                 lsr
                 lsr
 
-                !if     LZSA_SWAP_XZY {
-                sec                             ; Offset code relies on CS.
-                }
-
 .got_nibble:    ora     #$F0
                 rts
 
                 } else {
 
-.new_nibble:    inc     <lzsa_nibflg            ; Reset the flag.
-                LZSA_GET_SRC
-                sta     <lzsa_nibble            ; Preserve for next time.
-                lsr     a                       ; Extract the hi-nibble.
-                lsr     a
-                lsr     a
-                lsr     a
+lzsa2_new_nibble:
+                inc     <lzsa_nibflg            ; Reset the flag.
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) {
+
+                +LZSA_GET_SRC
+
+                } else {
+
+                lda     (lzsa_srcptr),y
+                inc     <lzsa_srcptr + 0
+                beq     .incsrc4
+.resume_src4:
 
-                !if     LZSA_SWAP_XZY {
-                sec                             ; Offset code relies on CS.
                 }
 
+                sta     <lzsa_nibble            ; Preserve for next time.
+                lsr                             ; Extract the hi-nibble.
+                lsr
+                lsr
+                lsr
                 rts
 
                 }
+
+                !if     (LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
+
+.incsrc4:
+                inc     <lzsa_srcptr + 1
+                bne     .resume_src4            ; always taken
+
+                }
diff --git a/asm/6502/decompress_small_v2.asm b/asm/6502/decompress_small_v2.asm
index 6daa1bd..42a7520 100644
--- a/asm/6502/decompress_small_v2.asm
+++ b/asm/6502/decompress_small_v2.asm
@@ -109,11 +109,9 @@ NO_LITERALS
    BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
    
 OFFSET_9_BIT                            ; 01Z: 9 bit offset
-   ;;ASL                                  ; shift Z (offset bit 8) in place
-   ROL
-   ROL
-   AND #$01
-   EOR #$FF                             ; set offset bits 15-9 to 1
+   ROL                                  ; carry: Z bit; A: xxxxxxx1 (carry known set from BCS OFFSET_9_BIT)
+   ADC #$00                             ; if Z bit is set, add 1 to A (bit 0 of A is now 0), otherwise bit 0 is 1
+   ORA #$FE                             ; set offset bits 15-9 to 1. reversed Z is already in bit 0
    BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
                                         ; (*same as JMP GOT_OFFSET_HI but shorter)
 
@@ -129,7 +127,6 @@ REPMATCH_OR_LARGE_OFFSET
                                         ; (*same as JMP GOT_OFFSET_HI but shorter)
 
 REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
-   ;;ASL                                  ; XYZ=111?
    BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
    
                                         ; 110: handle 16 bit offset
@@ -242,7 +239,6 @@ GETCOMBINEDBITS
 
    JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
    PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
-COMBINEDBITZ
    ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
 DECOMPRESSION_DONE
    RTS
diff --git a/asm/65816/decompress_v1.asm b/asm/65816/decompress_v1.asm
new file mode 100644
index 0000000..4754e55
--- /dev/null
+++ b/asm/65816/decompress_v1.asm
@@ -0,0 +1,281 @@
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK contain the compressed raw block address
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019-2020 Emmanuel Marty, Peter Ferrie
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+!cpu 65816
+DECOMPRESS_LZSA1                        ; entry point: decompress one raw LZSA1 block
+   SEP #$30                             ; set the M and X flags: 8-bit accumulator and index registers
+!as                                     ; assembler hint: accumulator is 8 bits wide
+!rs                                     ; assembler hint: index registers are 8 bits wide
+   LDY #$00                             ; Y holds the high byte of the copy counts, start at 0
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: O|LLL|MMMM
+   PHA                                  ; preserve token on stack
+
+   AND #$70                             ; isolate literals count
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   CMP #$70                             ; LITERALS_RUN_LEN?
+   BNE PREPARE_COPY_LITERALS            ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$F9                             ; subtract $F9 = add 7 (LITERALS_RUN_LEN) modulo 256
+   BCC PREPARE_COPY_LITERALS_DIRECT     ; borrow: the sum fits in 8 bits, count is complete
+   BEQ LARGE_VARLEN_LITERALS            ; if adding up to zero, go grab 16-bit count
+
+   JSR GETSRC                           ; get single extended byte of variable literals count
+   INY                                  ; add 256 to literals count
+   BCS PREPARE_COPY_LITERALS_DIRECT     ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
+
+LARGE_VARLEN_LITERALS                   ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   TXA                                  ; low 8 bits back in A; sets Z for the BEQ at the target
+   BCS PREPARE_COPY_LARGE_LITERALS      ; (*like JMP PREPARE_COPY_LARGE_LITERALS but shorter)
+
+PREPARE_COPY_LITERALS
+   TAX                                  ; X = LLL << 4 ($10-$60), used as table index
+   LDA SHIFT_TABLE-1,X                  ; shift literals length into place
+                                        ; -1 because position 00 is reserved
+PREPARE_COPY_LITERALS_DIRECT
+   TAX                                  ; X = low byte of literals count
+
+PREPARE_COPY_LARGE_LITERALS
+   BEQ COPY_LITERALS                    ; low byte is zero: no partial pass, skip the INY
+   INY                                  ; count the partial (low byte) pass as one more outer iteration
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX                                  ; inner counter: low byte of the count
+   BNE COPY_LITERALS
+   DEY                                  ; outer counter: remaining passes
+   BNE COPY_LITERALS
+   
+NO_LITERALS
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   BMI GET_LONG_OFFSET                  ; $80: 16 bit offset
+
+   JSR GETSRC                           ; get 8 bit offset from stream in A
+   TAX                                  ; save for later
+   LDA #$FF                             ; high 8 bits
+   BNE GOT_OFFSET                       ; go prepare match
+                                        ; (*like JMP GOT_OFFSET but shorter)
+
+SHORT_VARLEN_MATCHLEN
+   JSR GETSRC                           ; get single extended byte of variable match len
+   INY                                  ; add 256 to match length
+
+PREPARE_COPY_MATCH
+   TAX                                  ; X = low byte of match length
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; test the low byte of the match length (sets Z)
+   BEQ COPY_MATCH_LOOP                  ; low byte is zero: no partial pass, skip the INY
+   INY                                  ; count the partial (low byte) pass as one more outer iteration
+
+COPY_MATCH_LOOP
+   LDA $AAAAAA                          ; get one byte of backreference (operand is patched at run time)
+   JSR PUTDST                           ; copy to destination
+
+   REP #$20                             ; 16-bit memory access for the address update below
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   DEC COPY_MATCH_LOOP+1                ; step the low 16 bits of the LDA operand; bank byte (+3) is untouched
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1                ; step the low 16 bits of the LDA operand; bank byte (+3) is untouched
+
+}
+   SEP #$20                             ; back to an 8-bit accumulator
+
+   DEX                                  ; inner counter: low byte of the match length
+   BNE COPY_MATCH_LOOP
+   DEY                                  ; outer counter: remaining passes
+   BNE COPY_MATCH_LOOP
+   BEQ DECODE_TOKEN                     ; (*like JMP DECODE_TOKEN but shorter)
+
+GET_LONG_OFFSET                         ; handle 16 bit offset:
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+
+GOT_OFFSET
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   STX OFFSLO                           ; store low 8 bits of offset
+
+   SEC                                  ; subtract dest - match offset
+   REP #$20                             ; 16-bit accumulator for the subtraction
+!al                                     ; assembler hint: accumulator is 16 bits wide
+   LDA PUTDST+1                         ; low 16 bits of the current destination address
+OFFSLO = *+1                            ; low operand byte of the SBC below
+OFFSHI = *+2                            ; high operand byte of the SBC below
+   SBC #$AAAA                           ; 16 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   SEP #$20                             ; back to an 8-bit accumulator
+!as                                     ; assembler hint: accumulator is 8 bits wide
+   SEC                                  ; the ADC #$02 below relies on carry being set
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   STA OFFSHI                           ; store high 8 bits of offset
+   TXA                                  ; low 8 bits of offset
+
+   CLC                                  ; add dest + match offset
+   ADC PUTDST+1                         ; low 8 bits
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+OFFSHI = *+1                            ; operand byte of the LDA below
+   LDA #$AA                             ; high 8 bits
+
+   ADC PUTDST+2                         ; add high byte of destination plus carry from the low byte
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+}
+
+   LDA PUTDST+3                         ; bank
+   STA COPY_MATCH_LOOP+3                ; back reference is read from the destination's bank
+
+   PLA                                  ; retrieve token from stack again
+   AND #$0F                             ; isolate match len (MMMM)
+   ADC #$02                             ; plus carry (3 in total); carry comes from the high ADC, or the SEC when backward
+   CMP #$12                             ; MATCH_RUN_LEN?
+   BCC PREPARE_COPY_MATCH               ; if not, count is directly embedded in token
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
+   BCC PREPARE_COPY_MATCH               ; borrow: the sum fits in 8 bits, length is complete
+   BNE SHORT_VARLEN_MATCHLEN            ; non-zero: one more byte follows, length is 256 + that byte
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+                                        ; large match length with zero high byte?
+   BNE PREPARE_COPY_MATCH_Y             ; if not, continue
+
+DECOMPRESSION_DONE
+   RTS                                  ; zero high byte in a 16-bit match length ends decompression
+
+SHIFT_TABLE                             ; lookup: the byte at SHIFT_TABLE-1+n holds n >> 4 (n = $01-$7F)
+   !BYTE     $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00
+   !BYTE $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01
+   !BYTE $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02
+   !BYTE $03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03
+   !BYTE $04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04
+   !BYTE $05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05
+   !BYTE $06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06
+   !BYTE $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; fetch the byte, then fall through to store it
+PUTDST                                  ; store A at the destination address and step the address
+LZSA_DST_LO = *+1                       ; low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; high byte of the STA operand below
+LZSA_DST_BANK = *+3                     ; bank byte of the STA operand below
+   STA $AAAAAA                          ; operand is patched at run time: current destination address
+   REP #$20                             ; 16-bit memory access for the address update
+   DEC PUTDST+1                         ; move the low 16 bits of the destination address back by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A; carry is left unchanged
+LZSA_SRC_LO = *+1                       ; low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; high byte of the LDA operand below
+LZSA_SRC_BANK = *+3                     ; bank byte of the LDA operand below
+   LDA $AAAAAA                          ; operand is patched at run time: current source address
+   REP #$20                             ; 16-bit memory access for the address update
+   DEC GETSRC+1                         ; move the low 16 bits of the source address back by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; fetch the byte, then fall through to store it
+PUTDST                                  ; store A at the destination address and step the address
+LZSA_DST_LO = *+1                       ; low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; high byte of the STA operand below
+LZSA_DST_BANK = *+3                     ; bank byte of the STA operand below
+   STA $AAAAAA                          ; operand is patched at run time: current destination address
+   REP #$20                             ; 16-bit memory access for the address update
+   INC PUTDST+1                         ; advance the low 16 bits of the destination address by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A; carry is left unchanged
+LZSA_SRC_LO = *+1                       ; low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; high byte of the LDA operand below
+LZSA_SRC_BANK = *+3                     ; bank byte of the LDA operand below
+   LDA $AAAAAA                          ; operand is patched at run time: current source address
+   REP #$20                             ; 16-bit memory access for the address update
+   INC GETSRC+1                         ; advance the low 16 bits of the source address by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+}
diff --git a/asm/65816/decompress_v2.asm b/asm/65816/decompress_v2.asm
new file mode 100644
index 0000000..08c2ac8
--- /dev/null
+++ b/asm/65816/decompress_v2.asm
@@ -0,0 +1,338 @@
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA2 block.
+; Create one with lzsa -r -f2 <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK contain the compressed raw block address
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, +1
+;
+; -----------------------------------------------------------------------------
+; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
+; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
+;
+; in:
+; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK must contain the address of the last byte of compressed data
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK must contain the address of the last byte of the destination buffer
+;
+; out:
+; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, -1
+;
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019-2020 Emmanuel Marty, Peter Ferrie
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+!cpu 65816
+NIBCOUNT = $FC                          ; zero-page flag: 1 while a low nibble is still buffered in NIBBLES
+
+DECOMPRESS_LZSA2                        ; entry point: decompress one raw LZSA2 block
+   SEP #$30                             ; set the M and X flags: 8-bit accumulator and index registers
+!as                                     ; assembler hint: accumulator is 8 bits wide
+!rs                                     ; assembler hint: index registers are 8 bits wide
+   LDY #$00                             ; Y holds the high byte of the copy counts, start at 0
+   STY NIBCOUNT                         ; no nibble buffered yet
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: XYZ|LL|MMM
+   PHA                                  ; preserve token on stack
+
+   AND #$18                             ; isolate literals count (LL)
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   CMP #$18                             ; LITERALS_RUN_LEN_V2?
+   BCC PREPARE_COPY_LITERALS            ; if less, count is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra literals length nibble
+                                        ; add nibble to len from token
+   ADC #$02                             ; (LITERALS_RUN_LEN_V2) minus carry
+   CMP #$12                             ; LITERALS_RUN_LEN_V2 + 15 ?
+   BCC PREPARE_COPY_LITERALS_DIRECT     ; if less, literals count is complete
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$EE                             ; subtract $EE = add 18 modulo 256; carry stays set if the byte was >= $EE
+   BRA PREPARE_COPY_LITERALS_DIRECT     ; the carry decides there whether a 16-bit count follows
+
+PREPARE_COPY_LITERALS_LARGE
+                                        ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   BCS PREPARE_COPY_LITERALS_HIGH       ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
+
+PREPARE_COPY_LITERALS
+   LSR                                  ; shift literals count into place
+   LSR
+   LSR                                  ; carry is clear after the shifts (the low three bits were zero)
+
+PREPARE_COPY_LITERALS_DIRECT
+   TAX                                  ; X = low byte of literals count
+   BCS PREPARE_COPY_LITERALS_LARGE      ; carry set (extra byte wrapped around): literals count is large
+
+PREPARE_COPY_LITERALS_HIGH
+   TXA                                  ; test the low byte of the count (sets Z)
+   BEQ COPY_LITERALS                    ; low byte is zero: no partial pass, skip the INY
+   INY                                  ; count the partial (low byte) pass as one more outer iteration
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX                                  ; inner counter: low byte of the count
+   BNE COPY_LITERALS
+   DEY                                  ; outer counter: remaining passes
+   BNE COPY_LITERALS
+   
+NO_LITERALS
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   ASL                                  ; X bit -> carry
+   BCS REPMATCH_OR_LARGE_OFFSET         ; 1YZ: rep-match or 13/16 bit offset
+
+   ASL                                  ; 0YZ: 5 or 9 bit offset
+   BCS OFFSET_9_BIT                     ; Y set: 9 bit offset
+    
+                                        ; 00Z: 5 bit offset
+
+   LDX #$FF                             ; set offset bits 15-8 to 1
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 0, read nibble for bits 4-1
+   ORA #$E0                             ; set bits 7-5 to 1
+   BNE GOT_OFFSET_LO                    ; go store low byte of match offset and prepare match
+   
+OFFSET_9_BIT                            ; 01Z: 9 bit offset
+   ; A = token << 2 here: the Z bit (offset bit 8, stored inverted) is in bit 7
+   ROL                                  ; Z -> carry
+   ROL                                  ; Z -> bit 0
+   AND #$01                             ; keep only Z
+   EOR #$FF                             ; set offset bits 15-9 to 1
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_LARGE_OFFSET
+   ASL                                  ; 13 bit offset?
+   BCS REPMATCH_OR_16_BIT               ; handle rep-match or 16-bit offset if not
+
+                                        ; 10Z: 13 bit offset
+
+   JSR GETCOMBINEDBITS                  ; rotate Z bit into bit 8, read nibble for bits 12-9
+   ADC #$DE                             ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
+   BNE GOT_OFFSET_HI                    ; go store high byte, read low byte of match offset and prepare match
+                                        ; (*same as JMP GOT_OFFSET_HI but shorter)
+
+REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
+   ; N flag = token's Z bit after the ASL above: XYZ=111?
+   BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
+   
+                                        ; 110: handle 16 bit offset
+   JSR GETSRC                           ; grab high 8 bits
+GOT_OFFSET_HI
+   TAX                                  ; X = high byte of match offset
+   JSR GETSRC                           ; grab low 8 bits
+GOT_OFFSET_LO
+   STA OFFSLO                           ; store low byte of match offset
+   STX OFFSHI                           ; store high byte of match offset
+
+REP_MATCH
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression - subtract match offset
+
+   SEC                                  ; subtract dest - match offset
+   REP #$20                             ; 16-bit accumulator for the subtraction
+!al                                     ; assembler hint: accumulator is 16 bits wide
+   LDA PUTDST+1                         ; 16 bits
+OFFSLO = *+1                            ; low operand byte of the SBC below
+OFFSHI = *+2                            ; high operand byte of the SBC below
+   SBC #$AAAA                           ; operand is patched at run time: the match offset
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   SEP #$20                             ; back to an 8-bit accumulator
+!as                                     ; assembler hint: accumulator is 8 bits wide
+   SEC                                  ; the ADC #$01 below relies on carry being set
+
+} else {
+
+   ; Forward decompression - add match offset
+
+   CLC                                  ; add dest + match offset
+   REP #$20                             ; 16-bit accumulator for the addition
+!al                                     ; assembler hint: accumulator is 16 bits wide
+   LDA PUTDST+1                         ; 16 bits
+OFFSLO = *+1                            ; low operand byte of the ADC below
+OFFSHI = *+2                            ; high operand byte of the ADC below
+   ADC #$AAAA                           ; operand is patched at run time: the match offset
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   SEP #$20                             ; back to an 8-bit accumulator
+!as                                     ; assembler hint: accumulator is 8 bits wide
+}
+
+   LDA PUTDST+3                         ; bank
+   STA COPY_MATCH_LOOP+3                ; back reference is read from the destination's bank
+   
+   PLA                                  ; retrieve token from stack again
+   AND #$07                             ; isolate match len (MMM)
+   ADC #$01                             ; add MIN_MATCH_SIZE_V2 (1 + carry, which is expected to be set here)
+   CMP #$09                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+   BCC PREPARE_COPY_MATCH               ; if less, length is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra match length nibble
+                                        ; add nibble to len from token
+   ADC #$08                             ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
+   CMP #$18                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+   BCC PREPARE_COPY_MATCH               ; if less, match length is complete
+
+   JSR GETSRC                           ; get extra byte of variable match length
+                                        ; the carry is always set by the CMP above
+                                        ; GETSRC doesn't change it
+   SBC #$E8                             ; subtract $E8 = add 24 modulo 256; carry stays set if the byte was >= $E8
+
+PREPARE_COPY_MATCH
+   TAX                                  ; X = low byte of match length; sets Z for the BEQ below
+   BCC PREPARE_COPY_MATCH_Y             ; if not, the match length is complete
+   BEQ DECOMPRESSION_DONE               ; if EOD code, bail
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+
+PREPARE_COPY_MATCH_Y
+   TXA                                  ; test the low byte of the match length (sets Z)
+   BEQ COPY_MATCH_LOOP                  ; low byte is zero: no partial pass, skip the INY
+   INY                                  ; count the partial (low byte) pass as one more outer iteration
+
+COPY_MATCH_LOOP
+   LDA $AAAAAA                          ; get one byte of backreference (operand is patched at run time)
+   JSR PUTDST                           ; copy to destination
+
+   REP #$20                             ; 16-bit memory access for the address update below
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- put backreference bytes backward
+
+   DEC COPY_MATCH_LOOP+1                ; step the low 16 bits of the LDA operand; bank byte (+3) is untouched
+
+} else {
+
+   ; Forward decompression -- put backreference bytes forward
+
+   INC COPY_MATCH_LOOP+1                ; step the low 16 bits of the LDA operand; bank byte (+3) is untouched
+
+}
+   SEP #$20                             ; back to an 8-bit accumulator
+
+   DEX                                  ; inner counter: low byte of the match length
+   BNE COPY_MATCH_LOOP
+   DEY                                  ; outer counter: remaining passes
+   BNE COPY_MATCH_LOOP
+   JMP DECODE_TOKEN                     ; match copied, decode the next token
+
+GETCOMBINEDBITS                         ; in: bit 7 of A = token's Z bit; out: A = nibble << 1 | inverted Z
+   EOR #$80                             ; invert the Z bit (bit 7 of A)
+   ASL                                  ; inverted Z -> carry
+   PHP                                  ; keep that carry across the GETNIBBLE call
+
+   JSR GETNIBBLE                        ; get nibble into bits 0-3 (for offset bits 1-4)
+   PLP                                  ; merge Z bit as the carry bit (for offset bit 0)
+COMBINEDBITZ                            ; NOTE(review): this label is not referenced in the visible code
+   ROL                                  ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
+DECOMPRESSION_DONE                      ; shared exit: also reached on the end-of-data code
+   RTS
+
+GETNIBBLE                               ; return the next 4-bit value in A, always with carry set
+NIBBLES = *+1                           ; operand byte of the LDA below, holds the buffered byte
+   LDA #$AA                             ; operand is patched by the STA NIBBLES below
+   LSR NIBCOUNT                         ; flag -> carry, and clear the flag
+   BCC NEED_NIBBLES                     ; nothing buffered: fetch a new byte
+   AND #$0F                             ; isolate low 4 bits of nibble
+   RTS                                  ; carry is still set from the LSR above
+
+NEED_NIBBLES
+   INC NIBCOUNT                         ; flag = 1: the low nibble of the new byte is pending
+   JSR GETSRC                           ; get 2 nibbles
+   STA NIBBLES                          ; keep the whole byte for the next call
+   LSR                                  ; shift the high nibble down to bits 3-0
+   LSR                                  ;
+   LSR                                  ;
+   LSR                                  ;
+   SEC                                  ; callers add constants "minus carry", so return with carry set
+   RTS
+
+!ifdef BACKWARD_DECOMPRESS {
+
+   ; Backward decompression -- get and put bytes backward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; fetch the byte, then fall through to store it
+PUTDST                                  ; store A at the destination address and step the address
+LZSA_DST_LO = *+1                       ; low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; high byte of the STA operand below
+LZSA_DST_BANK = *+3                     ; bank byte of the STA operand below
+   STA $AAAAAA                          ; operand is patched at run time: current destination address
+   REP #$20                             ; 16-bit memory access for the address update
+   DEC PUTDST+1                         ; move the low 16 bits of the destination address back by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A; carry is left unchanged
+LZSA_SRC_LO = *+1                       ; low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; high byte of the LDA operand below
+LZSA_SRC_BANK = *+3                     ; bank byte of the LDA operand below
+   LDA $AAAAAA                          ; operand is patched at run time: current source address
+   REP #$20                             ; 16-bit memory access for the address update
+   DEC GETSRC+1                         ; move the low 16 bits of the source address back by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+} else {
+
+   ; Forward decompression -- get and put bytes forward
+
+GETPUT                                  ; copy one byte from source to destination
+   JSR GETSRC                           ; fetch the byte, then fall through to store it
+PUTDST                                  ; store A at the destination address and step the address
+LZSA_DST_LO = *+1                       ; low byte of the STA operand below
+LZSA_DST_HI = *+2                       ; high byte of the STA operand below
+LZSA_DST_BANK = *+3                     ; bank byte of the STA operand below
+   STA $AAAAAA                          ; operand is patched at run time: current destination address
+   REP #$20                             ; 16-bit memory access for the address update
+   INC PUTDST+1                         ; advance the low 16 bits of the destination address by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC                                  ; return the next source byte in A; carry is left unchanged
+LZSA_SRC_LO = *+1                       ; low byte of the LDA operand below
+LZSA_SRC_HI = *+2                       ; high byte of the LDA operand below
+LZSA_SRC_BANK = *+3                     ; bank byte of the LDA operand below
+   LDA $AAAAAA                          ; operand is patched at run time: current source address
+   REP #$20                             ; 16-bit memory access for the address update
+   INC GETSRC+1                         ; advance the low 16 bits of the source address by one
+   SEP #$20                             ; back to an 8-bit accumulator
+   RTS
+}
diff --git a/asm/6809/unlzsa1-6309.s b/asm/6809/unlzsa1-6309.s
new file mode 100644
index 0000000..5866e8d
--- /dev/null
+++ b/asm/6809/unlzsa1-6309.s
@@ -0,0 +1,90 @@
+;  unlzsa1-6309.s - Hitachi 6309 decompression routine for raw LZSA1 - 92 bytes
+;  compress with lzsa -f1 -r <original_file> <compressed_file>
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty, Doug Masten
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa1 equ lz1token
+
+lz1bigof lda ,x+           ; O set: load MSB 16-bit (negative, signed) offset
+lz1gotof leau d,y          ; put backreference start address in U (dst+offset)
+
+         ldd #$000f        ; clear MSB match length and set mask for MMMM
+         andb ,s+          ; isolate MMMM (embedded match length) in token, pop token off the stack
+         addb #$03         ; add MIN_MATCH_SIZE
+         cmpb #$12         ; MATCH_RUN_LEN?
+         bne lz1gotln      ; no, we have the full match length, go copy
+
+         addb ,x+          ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
+         bcc lz1gotln      ; if no overflow, we have the full length
+         bne lz1midln      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldb ,x+           ; load 16-bit len in D (low part in B, high in A)
+         lda ,x+           ; (little endian)
+         bne lz1gotln      ; check if we hit EOD (16-bit length = 0)
+         tstb
+         bne lz1gotln      ; go copy matched bytes if not
+
+         rts               ; done, bail
+
+lz1midln tfr b,a           ; copy high part of len into A
+         ldb ,x+           ; grab low 8 bits of len in B
+
+lz1gotln tfr d,w           ; set W with match length for TFM instruction
+         tfm u+,y+         ; copy match bytes
+
+lz1token ldb ,x+           ; load next token into B: O|LLL|MMMM
+         pshs b            ; save it
+
+         andb #$70         ; isolate LLL (embedded literals count) in B
+         beq lz1nolt       ; skip if no literals
+         cmpb #$70         ; LITERALS_RUN_LEN?
+         bne lz1declt      ; if not, we have the complete count, go unshift
+
+         ldb ,x+           ; load extra literals count byte
+         addb #$07         ; add LITERALS_RUN_LEN
+         bcc lz1gotla      ; if no overflow, we got the complete count, copy
+         bne lz1midlt      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldb ,x+           ; load low 8 bits of little-endian literals count
+         lda ,x+           ; load high 8 bits of literal count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1midlt tfr b,a           ; copy high part of literals count into A
+         ldb ,x+           ; load low 8 bits of literals count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+         lsrb
+
+lz1gotla clra              ; clear A (high part of literals count)
+lz1gotlt tfr d,w           ; set W with literals count for TFM instruction
+         tfm x+,y+         ; copy literal bytes
+
+lz1nolt  ldb ,x+           ; load either 8-bit or LSB 16-bit offset (negative, signed)
+         lda ,s            ; get token again, don't pop it from the stack
+         bmi lz1bigof      ; test O bit (small or large offset)
+
+         lda #$ff          ; set high 8 bits
+         bra lz1gotof
diff --git a/asm/6809/unlzsa1.s b/asm/6809/unlzsa1.s
new file mode 100644
index 0000000..559a303
--- /dev/null
+++ b/asm/6809/unlzsa1.s
@@ -0,0 +1,102 @@
+;  unlzsa1.s - 6809 decompression routine for raw LZSA1 - 110 bytes
+;  compress with lzsa -r <original_file> <compressed_file>
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa1 equ lz1token
+
+lz1bigof lda ,x+           ; O set: load MSB 16-bit (negative, signed) offset
+lz1gotof leau d,y          ; put backreference start address in U (dst+offset)
+
+         ldd #$000f        ; clear MSB match length and set mask for MMMM
+         andb ,s+          ; isolate MMMM (embedded match length) in token, pop token off the stack
+         addb #$03         ; add MIN_MATCH_SIZE
+         cmpb #$12         ; MATCH_RUN_LEN?
+         bne lz1gotln      ; no, we have the full match length, go copy
+
+         addb ,x+          ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
+         bcc lz1gotln      ; if no overflow, we have the full length
+         bne lz1midln      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldb ,x+           ; load 16-bit len in D (low part in B, high in A)
+         lda ,x+           ; (little endian)
+         bne lz1gotln      ; check if we hit EOD (16-bit length = 0)
+         tstb
+         bne lz1gotln      ; go copy matched bytes if not
+
+         rts               ; done, bail
+
+lz1midln tfr b,a           ; copy high part of len into A
+         ldb ,x+           ; grab low 8 bits of len in B
+
+lz1gotln pshs x            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+lz1cpymt lda ,u+           ; copy matched byte
+         sta ,y+
+         leax -1,x         ; decrement X
+         bne lz1cpymt      ; loop until all matched bytes are copied
+
+         puls x            ; restore source compressed data pointer
+
+lz1token ldb ,x+           ; load next token into B: O|LLL|MMMM
+         pshs b            ; save it
+
+         andb #$70         ; isolate LLL (embedded literals count) in B
+         beq lz1nolt       ; skip if no literals
+         cmpb #$70         ; LITERALS_RUN_LEN?
+         bne lz1declt      ; if not, we have the complete count, go unshift
+
+         ldb ,x+           ; load extra literals count byte
+         addb #$07         ; add LITERALS_RUN_LEN
+         bcc lz1gotla      ; if no overflow, we got the complete count, copy
+         bne lz1midlt      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldb ,x+           ; load low 8 bits of little-endian literals count
+         lda ,x+           ; load high 8 bits of literal count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1midlt tfr b,a           ; copy high part of literals count into A
+         ldb ,x+           ; load low 8 bits of literals count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+         lsrb
+lz1gotla clra              ; clear A (high part of literals count)
+
+lz1gotlt leau ,x           ; use U as the literals source pointer, freeing X for the count
+         tfr d,x           ; transfer 16-bit count into X
+lz1cpylt lda ,u+           ; copy literal byte
+         sta ,y+
+         leax -1,x         ; decrement X and update Z flag
+         bne lz1cpylt      ; loop until all literal bytes are copied
+         leax ,u           ; X resumes as the compressed data pointer, past the literals
+
+lz1nolt  ldb ,x+           ; load either 8-bit or LSB 16-bit offset (negative, signed)
+         lda ,s            ; get token again, don't pop it from the stack
+         bmi lz1bigof      ; test O bit (small or large offset)
+
+         lda #$ff          ; set high 8 bits
+         bra lz1gotof
diff --git a/asm/6809/unlzsa1b-6309.s b/asm/6809/unlzsa1b-6309.s
new file mode 100644
index 0000000..6078085
--- /dev/null
+++ b/asm/6809/unlzsa1b-6309.s
@@ -0,0 +1,92 @@
+;  unlzsa1b-6309.s - H6309 backward decompressor for raw LZSA1 - 97 bytes
+;  compress with lzsa -f1 -r -b <original_file> <compressed_file>
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty, Doug Masten
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa1
+         leax 1,x          ; point X one past the last byte: reads below pre-decrement (,-x)
+         bra lz1token
+
+lz1bigof ldd ,--x          ; O set: load long 16-bit (negative, signed) offset
+lz1gotof negd              ; reverse sign of offset in D
+         leau d,y          ; put backreference start address in U (dst+offset)
+
+         ldd #$000f        ; clear MSB match length and set mask for MMMM
+         andb ,s+          ; isolate MMMM (embedded match length) in token, pop token off the stack
+         addb #$03         ; add MIN_MATCH_SIZE
+         cmpb #$12         ; MATCH_RUN_LEN?
+         bne lz1gotln      ; no, we have the full match length, go copy
+
+         addb ,-x          ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
+         bcc lz1gotln      ; if no overflow, we have the full length
+         bne lz1midln      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldd ,--x          ; load 16-bit len in D (low part in B, high in A)
+         bne lz1gotln      ; check if we hit EOD (16-bit length = 0)
+
+         leay 1,y          ; adjust pointer to first byte of decompressed data
+         rts               ; done, bail
+
+lz1midln tfr b,a           ; copy high part of len into A
+         ldb ,-x           ; grab low 8 bits of len in B
+
+lz1gotln tfr d,w           ; set W with match length for TFM instruction
+         tfm u-,y-         ; copy match bytes
+
+lz1token ldb ,-x           ; load next token into B: O|LLL|MMMM
+         pshs b            ; save it
+
+         andb #$70         ; isolate LLL (embedded literals count) in B
+         beq lz1nolt       ; skip if no literals
+         cmpb #$70         ; LITERALS_RUN_LEN?
+         bne lz1declt      ; if not, we have the complete count, go unshift
+
+         ldb ,-x           ; load extra literals count byte
+         addb #$07         ; add LITERALS_RUN_LEN
+         bcc lz1gotla      ; if no overflow, we got the complete count, copy
+         bne lz1midlt      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldd ,--x          ; load 16 bit count in D (low part in B, high in A)
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1midlt tfr b,a           ; copy high part of literals count into A
+         ldb ,-x           ; load low 8 bits of literals count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+         lsrb
+
+lz1gotla clra              ; clear A (high part of literals count)
+lz1gotlt tfr d,w           ; set W with literals count for TFM instruction
+         leax -1,x         ; tfm is post-decrement
+         tfm x-,y-         ; copy literal bytes
+         leax 1,x          ; back to the pre-decrement convention used by the ,-x reads
+
+lz1nolt  ldb ,s            ; get token again, don't pop it from the stack
+         bmi lz1bigof      ; test O bit (small or large offset)
+
+         ldb ,-x           ; O clear: load 8 bit (negative, signed) offset
+         lda #$ff          ; set high 8 bits
+         bra lz1gotof
diff --git a/asm/6809/unlzsa1b.s b/asm/6809/unlzsa1b.s
new file mode 100644
index 0000000..ada6dcc
--- /dev/null
+++ b/asm/6809/unlzsa1b.s
@@ -0,0 +1,105 @@
+;  unlzsa1b.s - 6809 backward decompression routine for raw LZSA1 - 113 bytes
+;  compress with lzsa -r -b <original_file> <compressed_file>
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa1
+         leax 1,x          ; point X one past the last byte: reads below pre-decrement (,-x)
+         leay 1,y          ; same for the destination: writes below use sta ,-y
+         bra lz1token
+
+lz1bigof ldd ,--x          ; O set: load long 16 bit (negative, signed) offset
+lz1gotof nega              ; reverse sign of offset in D
+         negb              ; (negb sets carry when B is non-zero)
+         sbca #0           ; propagate that borrow into the high byte
+         leau d,y          ; put backreference start address in U (dst+offset)
+
+         ldd #$000f        ; clear MSB match length and set mask for MMMM
+         andb ,s+          ; isolate MMMM (embedded match length) in token, pop token off the stack
+
+         addb #$03         ; add MIN_MATCH_SIZE
+         cmpb #$12         ; MATCH_RUN_LEN?
+         bne lz1gotln      ; no, we have the full match length, go copy
+
+         addb ,-x          ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
+         bcc lz1gotln      ; if no overflow, we have the full length
+         bne lz1midln      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldd ,--x          ; load 16-bit len in D (low part in B, high in A)
+         bne lz1gotln      ; check if we hit EOD (16-bit length = 0)
+
+         rts               ; done, bail
+
+lz1midln tfr b,a           ; copy high part of len into A
+         ldb ,-x           ; grab low 8 bits of len in B
+
+lz1gotln pshs x            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+lz1cpymt lda ,-u           ; copy matched byte
+         sta ,-y
+         leax -1,x         ; decrement X
+         bne lz1cpymt      ; loop until all matched bytes are copied
+
+         puls x            ; restore source compressed data pointer
+
+lz1token ldb ,-x           ; load next token into B: O|LLL|MMMM
+         pshs b            ; save it
+
+         andb #$70         ; isolate LLL (embedded literals count) in B
+         beq lz1nolt       ; skip if no literals
+         cmpb #$70         ; LITERALS_RUN_LEN?
+         bne lz1declt      ; if not, we have the complete count, go unshift
+
+         ldb ,-x           ; load extra literals count byte
+         addb #$07         ; add LITERALS_RUN_LEN
+         bcc lz1gotla      ; if no overflow, we got the complete count, copy
+         bne lz1midlt      ; overflow, non-zero sum: sum is the high byte, low byte follows
+
+         ldd ,--x          ; load 16 bit count in D (low part in B, high in A)
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1midlt tfr b,a           ; copy high part of literals count into A
+         ldb ,-x           ; load low 8 bits of literals count
+         bra lz1gotlt      ; we now have the complete count, go copy
+
+lz1declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+         lsrb
+
+lz1gotla clra              ; clear A (high part of literals count)
+lz1gotlt leau ,x           ; use U as the literals source pointer, freeing X for the count
+         tfr d,x           ; transfer 16-bit count into X
+lz1cpylt lda ,-u           ; copy literal byte
+         sta ,-y
+         leax -1,x         ; decrement X and update Z flag
+         bne lz1cpylt      ; loop until all literal bytes are copied
+         leax ,u           ; X resumes as the compressed data pointer, below the literals
+
+lz1nolt  ldb ,s            ; get token again, don't pop it from the stack
+         bmi lz1bigof      ; test O bit (small or large offset)
+
+         ldb ,-x           ; O clear: load 8 bit (negative, signed) offset
+         lda #$ff          ; set high 8 bits
+         bra lz1gotof
diff --git a/asm/6809/unlzsa2-6309.s b/asm/6809/unlzsa2-6309.s
new file mode 100644
index 0000000..17970d8
--- /dev/null
+++ b/asm/6809/unlzsa2-6309.s
@@ -0,0 +1,129 @@
+;  unlzsa2-6309.s - Hitachi 6309 decompression routine for raw LZSA2 - 150 bytes
+;  compress with lzsa -f2 -r <original_file> <compressed_file>
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty, Doug Masten
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa2
+         clr lz2nibct      ; reset nibble available flag
+         bra lz2token
+
+lz2nibct fcb 0             ; nibble ready flag
+
+lz2replg lslb              ; push token's Y flag bit into carry
+         bcs lz2rep16      ; if token's Y bit is set, rep or 16 bit offset
+
+         sex               ; push token's Z flag bit into reg A
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; push token's Z flag bit into carry
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 13-15 of offset, reverse bit 8
+         tfr b,a           ; copy bits 8-15 of offset into A
+         suba #$02         ; subtract 512 from offset
+         ldb ,x+           ; load low 8 bits of (negative, signed) offset
+         bra lz2gotof
+
+lz2rep16 bmi lz2repof      ; if token's Z flag bit is set, rep match
+         ldd ,x++          ; load high then low 8 bits of offset
+
+lz2gotof std lz2moff+2     ; store match offset into the leau operand at lz2moff
+
+lz2repof ldd #$0007        ; clear MSB match length and set mask for MMM
+         andb ,u           ; isolate MMM (embedded match length) in token (U = token address)
+lz2moff  leau $aaaa,y      ; put backreference start address in U (dst+offset); $aaaa is patched above
+         addb #$02         ; add MIN_MATCH_SIZE_V2
+         cmpb #$09         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+         bne lz2gotln      ; no, we have the full match length, go copy
+
+         bsr lz2nibl       ; get extra match length nibble in B
+         addb #$09         ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
+         cmpb #$18         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+         bne lz2gotln      ; if not, we have the full match length, go copy
+
+         addb ,x+          ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
+         bcc lz2gotln      ; if no overflow, we have the full length
+         beq lz2done       ; detect EOD code
+
+         ldb ,x+           ; load 16-bit len in D (low part in B, high in A)
+         lda ,x+           ; (little endian)
+
+lz2gotln tfr d,w           ; set W with match count for TFM instruction
+         tfm u+,y+         ; copy match bytes
+
+lz2token tfr x,u           ; save token address
+         ldb ,x+           ; load next token into B: XYZ|LL|MMM
+         andb #$18         ; isolate LL (embedded literals count) in B
+         beq lz2nolt       ; skip if no literals
+         cmpb #$18         ; LITERALS_RUN_LEN_V2?
+         bne lz2declt      ; if not, we have the complete count, go unshift
+
+         bsr lz2nibl       ; get extra literals length nibble in B
+         addb #$03         ; add LITERALS_RUN_LEN_V2
+         cmpb #$12         ; LITERALS_RUN_LEN_V2 + 15 ?
+         bne lz2gotla      ; if not, we have the full literals count, go copy
+
+         addb ,x+          ; add extra literals count byte + LITERALS_RUN_LEN_V2 + 15
+         bcc lz2gotla      ; if no overflow, we got the complete count, copy
+
+         ldb ,x+           ; load low 8 bits of little-endian literals count
+         lda ,x+           ; load high 8 bits of literal count
+         bra lz2gotlt      ; we now have the complete count, go copy
+
+lz2declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+lz2gotla clra              ; clear A (high part of literals count)
+
+lz2gotlt tfr d,w           ; set W with literals count for TFM instruction
+         tfm x+,y+         ; copy literal bytes
+
+lz2nolt  ldb ,u            ; get token again
+         lslb              ; push token's X flag bit into carry
+         bcs lz2replg      ; if token's X bit is set, rep or large offset
+
+         lslb              ; push token's Y flag bit into carry
+         sex               ; push token's Z flag bit into reg A (carry flag is not affected)
+         bcs lz2offs9      ; if token's Y bit is set, 9 bits offset
+
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; retrieve token's Z flag bit and push into carry
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 5-7 of offset, reverse bit 0
+         sex               ; set bits 8-15 of offset to $FF
+         bra lz2gotof
+
+lz2offs9 deca              ; set bits 9-15 of offset, reverse bit 8
+         ldb ,x+           ; load low 8 bits of (negative, signed) offset
+         bra lz2gotof
+
+lz2nibl  ldb #$aa          ; operand is patched with the last nibble pair (see stb below)
+         com lz2nibct      ; nibble ready?
+         bpl lz2gotnb      ; flag toggled back to 0: use the low nibble of the saved byte
+
+         ldb ,x+           ; load two nibbles
+         stb lz2nibl+1     ; store nibble for next time (low 4 bits)
+         lsrb              ; shift 4 high bits of nibble down
+         lsrb
+         lsrb
+         lsrb
+lz2gotnb andb #$0f         ; only keep low 4 bits
+lz2done  rts
diff --git a/asm/6809/unlzsa2.s b/asm/6809/unlzsa2.s
new file mode 100644
index 0000000..a620cad
--- /dev/null
+++ b/asm/6809/unlzsa2.s
@@ -0,0 +1,146 @@
+;  unlzsa2.s - 6809 decompression routine for raw LZSA2 - 169 bytes
+;  compress with lzsa -f2 -r <original_file> <compressed_file>
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa2
+         clr <lz2nibct,pcr ; reset nibble available flag
+
+lz2token ldb ,x+           ; load next token into B: XYZ|LL|MMM
+         pshs b            ; save it
+
+         andb #$18         ; isolate LL (embedded literals count) in B
+         beq lz2nolt       ; skip if no literals
+         cmpb #$18         ; LITERALS_RUN_LEN_V2?
+         bne lz2declt      ; if not, we have the complete count, go unshift
+
+         bsr lz2nibl       ; get extra literals length nibble in B
+         addb #$03         ; add LITERALS_RUN_LEN_V2
+         cmpb #$12         ; LITERALS_RUN_LEN_V2 + 15 ?
+         bne lz2gotla      ; if not, we have the full literals count, go copy
+
+         addb ,x+          ; add extra literals count byte + LITERALS_RUN_LEN_V2 + 15
+         bcc lz2gotla      ; if no overflow, we got the complete count, copy
+
+         ldb ,x+           ; load low 8 bits of little-endian literals count
+         lda ,x+           ; load high 8 bits of literal count
+         bra lz2gotlt      ; we now have the complete count, go copy
+
+lz2declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+lz2gotla clra              ; clear A (high part of literals count)
+
+lz2gotlt leau ,x           ; use U as the literals source pointer, freeing X for the count
+         tfr d,x           ; transfer 16-bit count into X
+lz2cpylt lda ,u+           ; copy literal byte
+         sta ,y+
+         leax -1,x         ; decrement X and update Z flag
+         bne lz2cpylt      ; loop until all literal bytes are copied
+         leax ,u           ; X resumes as the compressed data pointer, past the literals
+
+lz2nolt  ldb ,s            ; get token again, don't pop it from the stack
+
+         lslb              ; push token's X flag bit into carry
+         bcs lz2replg      ; if token's X bit is set, rep or large offset
+
+         lslb              ; push token's Y flag bit into carry
+         sex               ; push token's Z flag bit into reg A (carry flag is not affected)
+         bcs lz2offs9      ; if token's Y bit is set, 9 bits offset
+
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; retrieve token's Z flag bit and push into carry
+
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 5-7 of offset, reverse bit 0
+         sex               ; set bits 8-15 of offset to $FF
+         bra lz2gotof
+
+lz2offs9 deca              ; set bits 9-15 of offset, reverse bit 8
+         ldb ,x+           ; load low 8 bits of (negative, signed) offset
+         bra lz2gotof
+
+lz2nibct fcb $00           ; nibble ready flag
+
+lz2nibl  ldb #$aa          ; operand is patched with the last nibble pair (see stb below)
+         com <lz2nibct,pcr ; toggle nibble ready flag and check
+         bpl lz2gotnb      ; flag toggled back to 0: use the low nibble of the saved byte
+
+         ldb ,x+           ; load two nibbles
+         stb <lz2nibl+1,pcr ; store nibble for next time (low 4 bits)
+
+         lsrb              ; shift 4 high bits of nibble down
+         lsrb
+         lsrb
+         lsrb
+
+lz2gotnb andb #$0f         ; only keep low 4 bits
+lz2done  rts
+
+lz2replg lslb              ; push token's Y flag bit into carry
+         bcs lz2rep16      ; if token's Y bit is set, rep or 16 bit offset
+
+         sex               ; push token's Z flag bit into reg A
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; push token's Z flag bit into carry
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 13-15 of offset, reverse bit 8
+         tfr b,a           ; copy bits 8-15 of offset into A
+         suba #$02         ; subtract 512 from offset
+         ldb ,x+           ; load low 8 bits of (negative, signed) offset
+         bra lz2gotof
+
+lz2rep16 bmi lz2repof      ; if token's Z flag bit is set, rep match
+         ldd ,x++          ; load high then low 8 bits of offset
+
+lz2gotof std <lz2repof+2,pcr ; store match offset into the leau operand below
+lz2repof leau $aaaa,y      ; put backreference start address in U (dst+offset); $aaaa is patched above
+
+         ldd #$0007        ; clear MSB match length and set mask for MMM
+         andb ,s+          ; isolate MMM (embedded match length) in token, pop token off the stack
+         addb #$02         ; add MIN_MATCH_SIZE_V2
+         cmpb #$09         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+         bne lz2gotln      ; no, we have the full match length, go copy
+
+         bsr lz2nibl       ; get extra match length nibble in B
+         addb #$09         ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
+         cmpb #$18         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+         bne lz2gotln      ; if not, we have the full match length, go copy
+
+         addb ,x+          ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
+         bcc lz2gotln      ; if no overflow, we have the full length
+         beq lz2done       ; detect EOD code
+
+         ldb ,x+           ; load 16-bit len in D (low part in B, high in A)
+         lda ,x+           ; (little endian)
+
+lz2gotln pshs x            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+lz2cpymt lda ,u+           ; copy matched byte
+         sta ,y+
+         leax -1,x         ; decrement X
+         bne lz2cpymt      ; loop until all matched bytes are copied
+
+         puls x            ; restore source compressed data pointer
+         lbra lz2token     ; go decode next token
diff --git a/asm/6809/unlzsa2b-6309.s b/asm/6809/unlzsa2b-6309.s
new file mode 100644
index 0000000..8e15bf8
--- /dev/null
+++ b/asm/6809/unlzsa2b-6309.s
@@ -0,0 +1,133 @@
+;  unlzsa2b-6309.s - H6309 backward decompressor for raw LZSA2 - 155 bytes
+;  compress with lzsa -f2 -r -b <original_file> <compressed_file>
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty, Doug Masten
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa2
+         clr lz2nibct      ; reset nibble available flag
+         leax 1,x          ; point X one past the last byte: input is read with pre-decrement
+         bra lz2token      ; go decode the first token
+
+lz2nibct fcb 0             ; nibble ready flag, toggled by each call to lz2nibl
+
+lz2replg lslb              ; push token's Y flag bit into carry
+         bcs lz2rep16      ; if token's Y bit is set, rep or 16 bit offset
+
+         sex               ; push token's Z flag bit into reg A
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; push token's Z flag bit into carry
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 13-15 of offset, reverse bit 8
+         tfr b,a           ; copy bits 8-15 of offset into A
+         suba #$02         ; subtract 512 from offset
+         bra lz2lowof      ; go read the low 8 bits of the offset
+
+lz2rep16 bmi lz2repof      ; if token's Z flag bit is set, rep match
+         lda ,-x           ; load high 8 bits of (negative, signed) offset
+lz2lowof ldb ,-x           ; load low 8 bits of offset
+
+lz2gotof negd              ; reverse sign of offset in D (output is written downward)
+         std lz2moff+2     ; patch match offset into the leau operand at lz2moff
+
+lz2repof ldd #$0007        ; clear MSB match length and set mask for MMM
+         andb ,u           ; isolate MMM (embedded match length) in token
+lz2moff  leau $aaaa,y      ; put backreference start address in U (dst+offset), $aaaa is patched
+         addb #$02         ; add MIN_MATCH_SIZE_V2
+         cmpb #$09         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+         bne lz2gotln      ; no, we have the full match length, go copy
+
+         bsr lz2nibl       ; get extra match length nibble in B
+         addb #$09         ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
+         cmpb #$18         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+         bne lz2gotln      ; if not, we have the full match length, go copy
+
+         addb ,-x          ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
+         bcc lz2gotln      ; if no overflow, we have the full length
+         beq lz2done       ; detect EOD code
+
+         ldd ,--x          ; load 16-bit len in D (low part in B, high in A)
+
+lz2gotln tfr d,w           ; set W with match count for TFM instruction
+         tfm u-,y-         ; copy match bytes
+
+lz2token ldb ,-x           ; load next token into B: XYZ|LL|MMM
+         tfr x,u           ; save token address
+         andb #$18         ; isolate LL (embedded literals count) in B
+         beq lz2nolt       ; skip if no literals
+         cmpb #$18         ; LITERALS_RUN_LEN_V2?
+         bne lz2declt      ; if not, we have the complete count, go unshift
+
+         bsr lz2nibl       ; get extra literals length nibble in B
+         addb #$03         ; add LITERALS_RUN_LEN_V2
+         cmpb #$12         ; LITERALS_RUN_LEN_V2 + 15 ?
+         bne lz2gotla      ; if not, we have the full literals count, go copy
+
+         addb ,-x          ; add extra literals count byte + LITERALS_RUN_LEN + 15
+         bcc lz2gotla      ; if no overflow, we got the complete count, copy
+
+         ldd ,--x          ; load 16 bit count in D (low part in B, high in A)
+         bra lz2gotlt      ; we now have the complete count, go copy
+
+lz2nibl  com lz2nibct      ; nibble ready?
+         bpl lz2gotnb      ; yes, return the low nibble saved by the previous call
+
+         ldb ,-x           ; load two nibbles
+         stb lz2gotnb+1    ; store nibble for next time (low 4 bits)
+         lsrb              ; shift 4 high bits of nibble down
+         lsrb
+         lsrb
+         lsrb
+         rts
+
+lz2declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+lz2gotla clra              ; clear A (high part of literals count)
+
+lz2gotlt tfr d,w           ; set W with literals count for TFM instruction
+         leax -1,x         ; tfm is post-decrement
+         tfm x-,y-         ; copy literal bytes
+         leax 1,x          ; undo the adjustment: X is one past the next unread byte again
+
+lz2nolt  ldb ,u            ; get token again
+         lslb              ; push token's X flag bit into carry
+         bcs lz2replg      ; if token's X bit is set, rep or large offset
+
+         lslb              ; push token's Y flag bit into carry
+         sex               ; push token's Z flag bit into reg A (carry flag is not affected)
+         bcs lz2offs9      ; if token's Y bit is set, 9 bits offset
+
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; retrieve token's Z flag bit and push into carry
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 5-7 of offset, reverse bit 0
+         sex               ; set bits 8-15 of offset to $FF
+         bra lz2gotof
+
+lz2offs9 deca              ; set bits 9-15 of offset, reverse bit 8
+         bra lz2lowof
+
+lz2done  leay 1,y          ; adjust pointer to first byte of decompressed data and then exit
+lz2gotnb ldb #$aa          ; load saved nibble byte ($aa is patched by lz2nibl)
+         andb #$0f         ; only keep low 4 bits
+         rts
diff --git a/asm/6809/unlzsa2b.s b/asm/6809/unlzsa2b.s
new file mode 100644
index 0000000..b538cac
--- /dev/null
+++ b/asm/6809/unlzsa2b.s
@@ -0,0 +1,152 @@
+;  unlzsa2b.s - 6809 backward decompression routine for raw LZSA2 - 171 bytes
+;  compress with lzsa -f2 -r -b <original_file> <compressed_file>
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+decompress_lzsa2
+         clr <lz2nibct,pcr ; reset nibble available flag
+         leax 1,x          ; point X one past the last compressed byte (reads pre-decrement)
+         leay 1,y          ; point Y one past the last output byte (writes pre-decrement)
+
+lz2token ldb ,-x           ; load next token into B: XYZ|LL|MMM
+         pshs b            ; save it
+
+         andb #$18         ; isolate LL (embedded literals count) in B
+         beq lz2nolt       ; skip if no literals
+         cmpb #$18         ; LITERALS_RUN_LEN_V2?
+         bne lz2declt      ; if not, we have the complete count, go unshift
+
+         bsr lz2nibl       ; get extra literals length nibble in B
+         addb #$03         ; add LITERALS_RUN_LEN_V2
+         cmpb #$12         ; LITERALS_RUN_LEN_V2 + 15 ?
+         bne lz2gotla      ; if not, we have the full literals count, go copy
+
+         addb ,-x          ; add extra literals count byte + LITERALS_RUN_LEN + 15
+         bcc lz2gotla      ; if no overflow, we got the complete count, copy
+
+         ldd ,--x          ; load 16 bit count in D (low part in B, high in A)
+         bra lz2gotlt      ; we now have the complete count, go copy
+
+lz2declt lsrb              ; shift literals count into place
+         lsrb
+         lsrb
+lz2gotla clra              ; clear A (high part of literals count)
+
+lz2gotlt leau ,x           ; use U as the literals source pointer
+         tfr d,x           ; transfer 16-bit count into X
+lz2cpylt lda ,-u           ; copy literal byte
+         sta ,-y
+         leax -1,x         ; decrement X and update Z flag
+         bne lz2cpylt      ; loop until all literal bytes are copied
+         leax ,u           ; compressed data pointer now sits below the copied literals
+
+lz2nolt  ldb ,s            ; get token again, don't pop it from the stack
+
+         lslb              ; push token's X flag bit into carry
+         bcs lz2replg      ; if token's X bit is set, rep or large offset
+
+         lslb              ; push token's Y flag bit into carry
+         sex               ; push token's Z flag bit into reg A (carry flag is not affected)
+         bcs lz2offs9      ; if token's Y bit is set, 9 bits offset
+
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; retrieve token's Z flag bit and push into carry
+
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 5-7 of offset, reverse bit 0
+         sex               ; set bits 8-15 of offset to $FF
+         bra lz2gotof
+
+lz2offs9 deca              ; set bits 9-15 of offset, reverse bit 8
+         bra lz2lowof
+
+lz2nibct fcb $00           ; nibble ready flag
+
+lz2nibl  ldb #$aa          ; load saved nibble byte ($aa is patched below)
+         com <lz2nibct,pcr ; toggle nibble ready flag and check
+         bpl lz2gotnb      ; saved nibble available: return its low 4 bits
+
+         ldb ,-x           ; load two nibbles
+         stb <lz2nibl+1,pcr ; store nibble for next time (low 4 bits)
+
+         lsrb              ; shift 4 high bits of nibble down
+         lsrb
+         lsrb
+         lsrb
+
+lz2gotnb andb #$0f         ; only keep low 4 bits
+lz2done  rts
+
+lz2replg lslb              ; push token's Y flag bit into carry
+         bcs lz2rep16      ; if token's Y bit is set, rep or 16 bit offset
+
+         sex               ; push token's Z flag bit into reg A
+         bsr lz2nibl       ; get offset nibble in B
+         lsla              ; retrieve token's Z flag bit and push into carry
+
+         rolb              ; shift Z flag from carry into bit 0 of B
+         eorb #$e1         ; set bits 13-15 of offset, reverse bit 8
+         tfr b,a           ; copy bits 8-15 of offset into A
+         suba #$02         ; subtract 512 from offset
+         bra lz2lowof
+
+lz2rep16 bmi lz2repof      ; if token's Z flag bit is set, rep match
+
+         lda ,-x           ; load high 8 bits of (negative, signed) offset
+lz2lowof ldb ,-x           ; load low 8 bits of offset
+
+lz2gotof nega              ; reverse sign of offset in D
+         negb              ; (16-bit negate: negate both halves,
+         sbca #0           ;  then propagate the borrow from B into A)
+         std <lz2repof+2,pcr ; store match offset
+
+lz2repof leau $aaaa,y      ; put backreference start address in U (dst+offset)
+
+         ldd #$0007        ; clear MSB match length and set mask for MMM
+         andb ,s+          ; isolate MMM (embedded match length) in token
+
+         addb #$02         ; add MIN_MATCH_SIZE_V2
+         cmpb #$09         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+         bne lz2gotln      ; no, we have the full match length, go copy
+
+         bsr lz2nibl       ; get extra match length nibble in B
+         addb #$09         ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
+         cmpb #$18         ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+         bne lz2gotln      ; if not, we have the full match length, go copy
+
+         addb ,-x          ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
+         bcc lz2gotln      ; if no overflow, we have the full length
+         beq lz2done       ; detect EOD code
+
+         ldd ,--x          ; load 16-bit len in D (low part in B, high in A)
+
+lz2gotln pshs x            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+lz2cpymt lda ,-u           ; copy matched byte
+         sta ,-y
+         leax -1,x         ; decrement X
+         bne lz2cpymt      ; loop until all matched bytes are copied
+
+         puls x            ; restore source compressed data pointer
+         lbra lz2token     ; go decode next token
diff --git a/asm/8088/LZSA1JMP.ASM b/asm/8088/LZSA1JMP.ASM
index b96b498..a1eac9f 100644
--- a/asm/8088/LZSA1JMP.ASM
+++ b/asm/8088/LZSA1JMP.ASM
@@ -1,4 +1,4 @@
-; lzsa1fta.asm time-efficient decompressor implementation for 808x CPUs.
+; lzsa2fta.asm time-efficient decompressor implementation for 808x CPUs.
 ; Turbo Assembler IDEAL mode dialect.
 ; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.)
 ;
@@ -15,6 +15,7 @@
 ; - Trashes all data and segment registers
 ;
 ; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+; Additional speed optimizations by Pavel Zagrebin
 ;
 ; This software is provided 'as-is', without any express or implied
 ; warranty.  In no event will the authors be held liable for any damages
@@ -107,7 +108,8 @@
 ; the 'M' bits in the token form the value 15, and an extra byte follows here,
 ; with three possible types of value.
 ;
-;  0-237: the value is added to the 15 stored in the token. The final value is 3 + 15 + this byte.
+;  0-237: the value is added to the 15 stored in the token.
+;         The final value is 3 + 15 + this byte.
 ;  239:   a second byte follows. The final match length is 256 + the second byte.
 ;  238:   a second and third byte follow, forming a little-endian 16-bit value.
 ;         The final encoded match length is that 16-bit value.
@@ -121,6 +123,14 @@
                 ; have the most code, but these are uncommon paths so the
                 ; tiny speed loss in just these paths is not a concern.
 
+;Setting OPTIMIZE_LONG_RLE to 1 speeds up decompressing long runs of the
+;same 16-bit word value, but hurts decompression speed of other data
+;types slightly.  Turn this on if you know your data has very long 16-bit
+;word-based runs (reported as RLE2 sequences in the LZSA compressor output
+;with an average length of at least 32 bytes), otherwise leave it off.
+
+OPTIMIZE_LONG_RLE EQU 0
+
 SEGMENT CODE para public
 
 ASSUME  cs:CODE, ds:CODE
@@ -138,43 +148,35 @@ leml2 EQU OFFSET lit_ext_mat_len_2b
 leme2 EQU OFFSET lit_ext_mat_ext_2b
 
 ;short-circuit special cases for 0 through 6 literal copies:
-l6ml1 EQU OFFSET lit_len_mat_len_1b
+l6ml1 EQU OFFSET lit_len_mat_len_1b_6
 l6me1 EQU OFFSET lit_len_mat_ext_1b
-l6ml2 EQU OFFSET lit_len_mat_len_2b
+l6ml2 EQU OFFSET lit_len_mat_len_2b_6
 l6me2 EQU OFFSET lit_len_mat_ext_2b
-l5ml1 EQU OFFSET lit_len_mat_len_1b + 1
+l5ml1 EQU OFFSET lit_len_mat_len_1b_45
 l5me1 EQU OFFSET lit_len_mat_ext_1b + 1
-l5ml2 EQU OFFSET lit_len_mat_len_2b + 1
+l5ml2 EQU OFFSET lit_len_mat_len_2b_45
 l5me2 EQU OFFSET lit_len_mat_ext_2b + 1
-l4ml1 EQU OFFSET lit_len_mat_len_1b + 2
+l4ml1 EQU OFFSET lit_len_mat_len_1b_45 + 1
 l4me1 EQU OFFSET lit_len_mat_ext_1b + 2
-l4ml2 EQU OFFSET lit_len_mat_len_2b + 2
+l4ml2 EQU OFFSET lit_len_mat_len_2b_45 + 1
 l4me2 EQU OFFSET lit_len_mat_ext_2b + 2
-l3ml1 EQU OFFSET lit_len_mat_len_1b + 3
+l3ml1 EQU OFFSET lit_len_mat_len_1b_23
 l3me1 EQU OFFSET lit_len_mat_ext_1b + 3
-l3ml2 EQU OFFSET lit_len_mat_len_2b + 3
+l3ml2 EQU OFFSET lit_len_mat_len_2b_23
 l3me2 EQU OFFSET lit_len_mat_ext_2b + 3
-l2ml1 EQU OFFSET lit_len_mat_len_1b + 4
+l2ml1 EQU OFFSET lit_len_mat_len_1b_23 + 1
 l2me1 EQU OFFSET lit_len_mat_ext_1b + 4
-l2ml2 EQU OFFSET lit_len_mat_len_2b + 4
+l2ml2 EQU OFFSET lit_len_mat_len_2b_23 + 1
 l2me2 EQU OFFSET lit_len_mat_ext_2b + 4
-l1ml1 EQU OFFSET lit_len_mat_len_1b + 5
+l1ml1 EQU OFFSET lit_len_mat_len_1b_01
 l1me1 EQU OFFSET lit_len_mat_ext_1b + 5
-l1ml2 EQU OFFSET lit_len_mat_len_2b + 5
+l1ml2 EQU OFFSET lit_len_mat_len_2b_01
 l1me2 EQU OFFSET lit_len_mat_ext_2b + 5
-l0ml1 EQU OFFSET lit_len_mat_len_1b + 6 ; MMMM handling comes after LLL code
-l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code
-l0ml2 EQU OFFSET lit_len_mat_len_2b + 6 ; MMMM handling comes after LLL code
-l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code
+l0ml1 EQU OFFSET lit_len_mat_len_1b_01 + 1 ; MMMM handling comes after LLL code
+l0me1 EQU OFFSET lit_len_mat_ext_1b + 6    ; MMMM handling comes after LLL code
+l0ml2 EQU OFFSET lit_len_mat_len_2b_01 + 1 ; MMMM handling comes after LLL code
+l0me2 EQU OFFSET lit_len_mat_ext_2b + 6    ; MMMM handling comes after LLL code
 
-; === Hand-written (!) jumptable actually begins here.
-; Located before the program code results in an extra JMP and 3 wasted bytes,
-; but it makes the code easier to follow in this location.
-; Relocate the jump table after the ENDP directive to save 3 bytes.
-;
-; 7 6 5 4 3 2 1 0
-; O L L L M M M M
-;
 ;         0     1     2     3     4     5     6     7     8     9     a     b     c     d     e     f
 jtbl DW l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0me1 ;0
      DW l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1me1 ;1
@@ -215,7 +217,7 @@ MACRO get_word_match_offset
 ENDM
 
 MACRO do_match_copy_long
-LOCAL do_run, do_run_w
+LOCAL even0,even1,even2,do_run,do_run_w
 ; Copies a long match as optimally as possible.
 ; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
 ; trashes: ax, bx
@@ -226,45 +228,52 @@ LOCAL do_run, do_run_w
         xchg    ax,si           ;save si
         lea     si,[bp+di]      ;si = output buffer + negative match offset
         cmp     bp,-2           ;do we have a byte/word run to optimize?
-        jae     do_run          ;perform a run if so, otherwise fall through
-;You may be tempted to change "jae" to "jge" because DX is a signed number.
-;Don't!  The total window is 64k, so if you treat this as a signed comparison,
-;you will get incorrect results for offsets over 32K.
+IF OPTIMIZE_LONG_RLE
+        jae     do_run          ;catch offset = -2 or -1
+ELSE
+        ja      do_run          ;catch offset = -1
+ENDIF
 
 ;If we're here, we have a long copy and it isn't byte-overlapping (if it
 ;overlapped, we'd be in @@do_run)  So, let's copy faster with REP MOVSW.
 ;This affects 8088 only slightly, but is a bigger win on 8086 and higher.
         shr     cx,1
+        jnc     even0
+        movsb
+even0:
         rep     movsw
-        adc     cl,0
-        rep     movsb
         xchg    si,ax           ;restore si
         mov     ds,bx           ;restore ds
         jmp     decode_token
-
 do_run:
+IF OPTIMIZE_LONG_RLE
         je      do_run_w        ;if applicable, handle word-sized value faster
+ENDIF
         xchg    dx,ax           ;save si into dx, as ax is getting trashed
         lodsb                   ;load first byte of run into al
         mov     ah,al
         shr     cx,1
+        jnc     even1
+        stosb
+even1:
         rep     stosw           ;perform word run
-        adc     cl,0
-        rep     stosb           ;finish word run
         mov     si,dx           ;restore si
         mov     ds,bx           ;restore ds
         jmp     decode_token
 
+IF OPTIMIZE_LONG_RLE
 do_run_w:
         xchg    dx,ax           ;save si into dx, as ax is getting trashed
         lodsw                   ;load first word of run
         shr     cx,1
         rep     stosw           ;perform word run
-        adc     cl,0            ;despite 2-byte offset, compressor might
-        rep     stosb           ;output odd length. better safe than sorry.
+        jnc     even2
+        stosb                   ;should be after rep stosw!
+even2:
         mov     si,dx           ;restore si
         mov     ds,bx           ;restore ds
         jmp     decode_token
+ENDIF
 ENDM
 
 MACRO do_match_copy
@@ -277,6 +286,9 @@ MACRO do_match_copy
         mov     ds,ax           ;ds=es
         xchg    ax,si           ;save si
         lea     si,[bp+di]      ;si = output buffer + negative match offset
+        movsb
+        movsb
+        movsb                   ;Handle MINMATCH (instead of add cx,MINMATCH)
         rep     movsb
         xchg    si,ax           ;restore si
         mov     ds,bx           ;restore ds
@@ -284,34 +296,36 @@ MACRO do_match_copy
 ENDM
 
 MACRO do_literal_copy
+LOCAL even
 ; Copies a literal sequence using words.
 ; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
 ; requirements: cx=length, ds:si=compdata, es:di=output
 ; must leave cx=0 at exit
         shr     cx,1
+        jnc even                ;even length: no odd byte to copy
+        movsb                   ;odd length: copy the single odd byte first
+even:
         rep     movsw
-        adc     cl,0
-        rep     movsb
 ENDM
 
 MACRO copy_small_match_len
         and     al,0FH          ;isolate length in token (MMMM)
-        add     al,minmatch     ;ax=match length
         xchg    cx,ax           ;cx=match length
         do_match_copy           ;copy match with cx=length, bp=offset
 ENDM
 
 MACRO copy_large_match_len
-LOCAL val239, val238, EOD
+LOCAL val239,val238,EOD
 ; Handle MMMM=Fh
 ; Assumptions: ah=0 from get_????_match_offset's xchg
         lodsb                   ;grab extra match length byte
         add     al,0Fh+minmatch ;add MATCH_RUN_LEN + MIN_MATCH_SIZE
-        jz      val238          ;if zf & cf, 238: get 16-bit match length
+;       jz      val238          ;if zf & cf, 238: get 16-bit match length
         jc      val239          ;if cf,      239: get extra match length byte
         xchg    cx,ax           ;otherwise, we have our match length
         do_match_copy_long      ;copy match with cx=length, bp=offset
 val239:
+        jz val238
         lodsb                   ;ah=0; grab single extra length byte
         inc     ah              ;ax=256+length byte
         xchg    cx,ax
@@ -347,16 +361,27 @@ decode_token:
 
 ; Path #1: LLL=0-6, MMMM=0-Eh, O=0 (1-byte match offset)
 ; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
-lit_len_mat_len_1b:
-        movsb
-        movsb
-        movsb
-        movsb
-        movsb
+lit_len_mat_len_1b_01:
         movsb
         get_byte_match_offset
         copy_small_match_len
-
+lit_len_mat_len_1b_23:
+        movsb
+        movsw
+        get_byte_match_offset
+        copy_small_match_len
+lit_len_mat_len_1b_45:
+        movsb
+        movsw
+        movsw
+        get_byte_match_offset
+        copy_small_match_len
+lit_len_mat_len_1b_6:
+        movsw
+        movsw
+        movsw
+        get_byte_match_offset
+        copy_small_match_len
 
 ; Path #2: LLL=0-6, MMMM=Fh,   O=0 (1-byte match offset)
 lit_len_mat_ext_1b:
@@ -375,13 +400,14 @@ lit_ext_mat_len_1b:
 ; on entry: ax=0 + token, bp=ax
         lodsb                   ;grab extra literal length byte
         add     al,litrunlen    ;add 7h literal run length
-        jz      @@val249_3      ;if zf & cf, 249: get 16-bit literal length
+;       jz      @@val249_3      ;if zf & cf, 249: get 16-bit literal length
         jc      @@val250_3      ;if cf,      250: get extra literal length byte
         xchg    cx,ax           ;otherwise, we have our literal length
         do_literal_copy         ;this might be better as rep movsw !!! benchmark
         get_byte_match_offset
         copy_small_match_len
 @@val250_3:
+jz      @@val249_3
         lodsb                   ;ah=0; grab single extra length byte
         inc     ah              ;ax=256+length byte
         xchg    cx,ax
@@ -401,13 +427,14 @@ lit_ext_mat_ext_1b:
 ; on entry: ax=0 + token, bp=ax
         lodsb                   ;grab extra literal length byte
         add     al,litrunlen    ;add 7h literal run length
-        jz      @@val249_4      ;if zf & cf, 249: get 16-bit literal length
+;       jz      @@val249_4      ;if zf & cf, 249: get 16-bit literal length
         jc      @@val250_4      ;if cf,      250: get extra literal length byte
         xchg    cx,ax           ;otherwise, we have our literal length
         do_literal_copy         ;this might be better as rep movsw !!! benchmark
         get_byte_match_offset
         copy_large_match_len
 @@val250_4:
+jz @@val249_4
         lodsb                   ;ah=0; grab single extra length byte
         inc     ah              ;ax=256+length byte
         xchg    cx,ax
@@ -424,17 +451,30 @@ lit_ext_mat_ext_1b:
 
 ; Path #5: LLL=0-6, MMMM=0-Eh, O=1 (2-byte match offset)
 ; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
-lit_len_mat_len_2b:
-        movsb
-        movsb
-        movsb
+lit_len_mat_len_2b_01:
         movsb
+        get_word_match_offset
+        copy_small_match_len
+lit_len_mat_len_2b_23:
         movsb
+        movsw
+        get_word_match_offset
+        copy_small_match_len
+lit_len_mat_len_2b_45:
         movsb
+        movsw
+        movsw
+        get_word_match_offset
+        copy_small_match_len
+lit_len_mat_len_2b_6:
+        movsw
+        movsw
+        movsw
         get_word_match_offset
         copy_small_match_len
 
 
+; Path #6: LLL=0-6, MMMM=Fh,   O=1 (2-byte match offset)
 ; Path #6: LLL=0-6, MMMM=Fh,   O=1 (2-byte match offset)
 lit_len_mat_ext_2b:
         movsb
@@ -446,19 +486,19 @@ lit_len_mat_ext_2b:
         get_word_match_offset
         copy_large_match_len
 
-
 ; Path #7: LLL=7,   MMMM=0-Eh, O=1 (2-byte match offset)
 lit_ext_mat_len_2b:
 ; on entry: ax=0 + token, bp=ax
         lodsb                   ;grab extra literal length byte
         add     al,litrunlen    ;add 7h literal run length
-        jz      @@val249_7      ;if zf & cf, 249: get 16-bit literal length
+;       jz      @@val249_7      ;if zf & cf, 249: get 16-bit literal length
         jc      @@val250_7      ;if cf,      250: get extra literal length byte
         xchg    cx,ax           ;otherwise, we have our literal length
         do_literal_copy         ;this might be better as rep movsw !!! benchmark
         get_word_match_offset
         copy_small_match_len
 @@val250_7:
+jz @@val249_7
         lodsb                   ;ah=0; grab single extra length byte
         inc     ah              ;ax=256+length byte
         xchg    cx,ax
@@ -478,13 +518,14 @@ lit_ext_mat_ext_2b:
 ; on entry: ax=0 + token, bp=ax
         lodsb                   ;grab extra literal length byte
         add     al,litrunlen    ;add 7h literal run length
-        jz      @@val249_8      ;if zf & cf, 249: get 16-bit literal length
+;       jz      @@val249_8      ;if zf & cf, 249: get 16-bit literal length
         jc      @@val250_8      ;if cf,      250: get extra literal length byte
         xchg    cx,ax           ;otherwise, we have our literal length
         do_literal_copy         ;this might be better as rep movsw !!! benchmark
         get_word_match_offset
         copy_large_match_len
 @@val250_8:
+jz @@val249_8
         lodsb                   ;ah=0; grab single extra length byte
         inc     ah              ;ax=256+length byte
         xchg    cx,ax
@@ -512,6 +553,8 @@ ENDS    CODE
 
 END
 
+
+
 ;Speed optimization history (decompression times in microseconds @ 4.77 MHz):
 ; defer add MIN_MATCH_SIZE  shuttle  97207 alice 57200 robotron 362884 ++*
 ; jumptable rewrite, no RLE shuttle  97744 alice 46905 robotron 309032 -++
@@ -521,3 +564,18 @@ END
 ; long match copy #1 16-bit shuttle  92490 alice 46905 robotron 308722 +*+
 ; long match copy #2 extraB shuttle  92464 alice 46905 robotron 308371 +.+
 ; long match copy #3 0f->ed shuttle  86765 alice 46864 robotron 303895 +++!
+; baseline new test harness shuttle  83925 alice 37948 robotron 269002 ***
+; Pavel optimizations       shuttle  82225 alice 36798 robotron 261226 +++
+; OPTIMIZE_LONG_RLE 1       shuttle  82242 alice 36787 robotron 261392 **-
+;
+;------
+;
+;Pavel's optimization history:
+;                        shuttle   alice   robotron  time in 1.193 MHz timer clocks
+;baseline                  19109    D9A6      570F6
+;adc cl,0->adc cl,cl       19035    D9A6      56FAB
+;rep movsb->shr cx,1;jnc   18FD4    D998      56F14
+;cmp bp,-2->inc bp;inc bp  18F07    D999      56EA3
+;jz;jc->jc                 18D81    D973      56B2F
+;add al,3->movsb x3        18B1E    D777      56197
+;more lit_len_mat tables   18A83    D341      54ACC
diff --git a/asm/x86/decompress_small_v1.asm b/asm/x86/decompress_small_v1.asm
new file mode 100644
index 0000000..41ce991
--- /dev/null
+++ b/asm/x86/decompress_small_v1.asm
@@ -0,0 +1,120 @@
+;  decompress_small_v1.asm - space-efficient decompressor implementation for x86
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+    segment .text
+    bits 32
+
+;  ---------------------------------------------------------------------------
+;  Decompress raw LZSA1 block
+;  inputs:
+;  * esi: raw LZSA1 block
+;  * edi: output buffer
+;  output:
+;  * eax:    decompressed size
+;  ---------------------------------------------------------------------------
+
+    %ifndef BIN
+      global lzsa1_decompress
+      global _lzsa1_decompress
+    %endif
+    
+lzsa1_decompress:
+_lzsa1_decompress:
+    pushad
+    
+    ;mov    edi, [esp+32+4]    ; edi = outbuf
+    ;mov    esi, [esp+32+8]    ; esi = inbuf
+    
+    xor    ecx, ecx
+.decode_token:
+    mul    ecx                ; ecx is zero here, so this clears both eax and edx
+    lodsb                     ; read token byte: O|LLL|MMMM
+    mov    dl, al             ; keep token in dl
+   
+    and    al, 070H           ; isolate literals length in token (LLL)
+    shr    al, 4              ; shift literals length into place
+
+    cmp    al, 07H            ; LITERALS_RUN_LEN?
+    jne    .got_literals      ; no, we have the full literals count from the token, go copy
+
+    lodsb                     ; grab extra length byte
+    add    al, 07H            ; add LITERALS_RUN_LEN
+    jnc    .got_literals      ; if no overflow, we have the full literals count, go copy
+    jne    .mid_literals      ; overflowed to non-zero: one more length byte follows
+
+    lodsw                     ; grab 16-bit extra length
+    jmp    .got_literals
+
+.mid_literals:
+    lodsb                     ; grab single extra length byte
+    inc    ah                 ; add 256
+
+.got_literals:
+    xchg   ecx, eax
+    rep    movsb              ; copy ecx literals from [esi] to [edi]
+
+    test   dl, dl             ; check match offset size in token (O bit)
+    js     .get_long_offset
+
+    dec     ecx               ; ecx = -1 (it was zero after the rep movsb above)
+    xchg    eax, ecx          ; set ah and upper eax to all ones, ecx gets eax's old value
+    lodsb
+    jmp     .get_match_length
+
+.get_long_offset:
+    lodsw                     ; Get 2-byte match offset
+
+.get_match_length:
+    xchg    eax, edx          ; edx: match offset  eax: original token
+    and     al, 0FH           ; isolate match length in token (MMMM)
+    add     al, 3             ; add MIN_MATCH_SIZE
+
+    cmp     al, 012H          ; MIN_MATCH_SIZE + MATCH_RUN_LEN?
+    jne     .got_matchlen     ; no, we have the full match length from the token, go copy
+
+    lodsb                     ; grab extra length byte
+    add     al,012H           ; add MIN_MATCH_SIZE + MATCH_RUN_LEN
+    jnc     .got_matchlen     ; if no overflow, we have the entire length
+    jne     .mid_matchlen     ; overflowed to non-zero: one more length byte follows
+
+    lodsw                     ; grab 16-bit length
+    test    eax, eax          ; bail if we hit EOD
+    je      .done_decompressing 
+    jmp     .got_matchlen
+
+.mid_matchlen:
+    lodsb                     ; grab single extra length byte
+    inc     ah                ; add 256
+
+.got_matchlen:
+    xchg    ecx, eax          ; copy match length into ecx
+    xchg    esi, eax          ; save esi (compressed data pointer) in eax
+    mov     esi, edi          ; esi now points at back reference in output data
+    movsx   edx, dx           ; sign-extend dx to 32-bits.
+    add     esi, edx
+    rep     movsb             ; copy match
+    xchg    esi, eax          ; restore esi
+    jmp     .decode_token     ; go decode another token
+
+.done_decompressing:
+    sub    edi, [esp+32+4]    ; NOTE(review): reads 1st stack arg as outbuf, loads above are commented out - confirm
+    mov    [esp+28], edi      ; write size into pushad's saved eax slot: eax = decompressed size
+    popad
+    ret                       ; done
diff --git a/asm/x86/decompress_small_v2.asm b/asm/x86/decompress_small_v2.asm
new file mode 100644
index 0000000..fe185c1
--- /dev/null
+++ b/asm/x86/decompress_small_v2.asm
@@ -0,0 +1,181 @@
+;  decompress_small_v2.asm - space-efficient decompressor implementation for x86
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+    segment .text
+    bits 32
+
+;  ---------------------------------------------------------------------------
+;  Decompress raw LZSA2 block
+;  inputs:
+;  * esi: raw LZSA2 block
+;  * edi: output buffer
+;  output:
+;  * eax:    decompressed size
+;  ---------------------------------------------------------------------------
+    
+    %ifndef BIN
+      global lzsa2_decompress
+      global _lzsa2_decompress
+    %endif
+    
+lzsa2_decompress:
+_lzsa2_decompress:
+    pushad                      ; save all registers (32 bytes); the saved eax lands at [esp+28]
+    
+    ;mov    edi, [esp+32+4]      ; edi = outbuf
+    ;mov    esi, [esp+32+8]      ; esi = inbuf
+    
+    xor    ecx, ecx             ; ecx = 0, required by the mul at .decode_token
+    xor    ebx, ebx             ; ebx = 0100H once bh is incremented below
+    inc    bh                   ; bh = 1: no nibble buffered yet (bl is the nibble buffer, see .get_nibble)
+    xor    ebp, ebp             ; ebp = current match offset, kept across tokens for rep-matches
+
+.decode_token:
+    mul    ecx                  ; ecx is 0 here, so this clears both eax and edx
+    lodsb                       ; read token byte: XYZ|LL|MMM
+    mov    dl, al               ; keep token in dl
+   
+    and    al, 018H             ; isolate literals length in token (LL)
+    shr    al, 3                ; shift literals length into place
+
+    cmp    al, 03H              ; LITERALS_RUN_LEN_V2?
+    jne    .got_literals        ; no, we have the full literals count from the token, go copy
+
+    call   .get_nibble          ; get extra literals length nibble in cl
+    add    al, cl               ; add nibble to the 3 taken from the token
+    cmp    al, 012H             ; LITERALS_RUN_LEN_V2 + 15 ?
+    jne    .got_literals        ; if not, we have the full literals count, go copy
+
+    lodsb                       ; grab extra length byte
+    add    al,012H              ; add LITERALS_RUN_LEN_V2 + 15; carry set if the sum exceeds 255
+    jnc    .got_literals        ; if not, we have the full literals count, go copy
+
+    lodsw                       ; grab 16-bit extra length, it replaces the count in ax
+
+.got_literals:
+    xchg   ecx, eax             ; ecx = literals count
+    rep    movsb                ; copy ecx literals from esi to edi
+
+    test   dl, 0C0h             ; test X and Y bits: SF = X, ZF = both clear, PF = both set or both clear
+    js     .rep_match_or_large_offset ; X set: 13-bit, 16-bit or rep-match
+
+    ;;cmp dl,040H               ; check if this is a 5 or 9-bit offset (Y bit)
+                                ; discovered via the test with bit 6 set
+    xchg   ecx, eax             ; eax = 0 (ecx is zero from the rep movsb above); flags are preserved
+    jne    .offset_9_bit        ; ZF clear with X clear means Y is set: 9-bit offset
+
+                                ; 5 bit offset
+    cmp    dl, 020H             ; carry set if bit 5 (Z) is clear, X and Y being 0 here
+    call   .get_nibble_x        ; al = offset bits 7-0: 111, nibble, inverted Z
+    jmp    .dec_offset_top      ; go set offset bits 15-8 to 1
+
+.offset_9_bit:                  ; 9 bit offset
+    lodsb                       ; get 8 bit offset from stream in A
+    dec    ah                   ; set offset bits 15-8 to 1
+    test   dl, 020H             ; test bit Z (inverted offset bit 8)
+    je     .get_match_length    ; Z clear: offset bit 8 stays 1
+.dec_offset_top:
+    dec    ah                   ; 9-bit path: Z is set, so clear offset bit 8 (0FFH -> 0FEH)
+                                ; 5-bit path: ah was 0, so set offset bits 15-8 to 1
+    jmp    .get_match_length
+
+.rep_match_or_large_offset:
+    ;;cmp dl,0c0H               ; check if this is a 13-bit offset or a 16-bit offset/rep match (Y bit)
+    jpe    .rep_match_or_16_bit ; even parity from the test above: X and Y are both set
+
+                                ; 13 bit offset
+
+    cmp    dl, 0A0H             ; carry set if bit 5 (Z) is clear (knowing that bit 7 is set and bit 6 clear)
+    xchg   ah, al               ; al = 0 (old ah); the stale al moves to ah and is replaced at .get_match_length_1
+    call   .get_nibble_x        ; al = offset bits 15-8: 111, nibble, inverted Z
+    sub    al, 2                ; subtract 512 from the offset (al holds offset bits 15-8)
+    jmp    .get_match_length_1
+
+.rep_match_or_16_bit:
+    test   dl, 020H             ; test bit Z: 110 = 16-bit offset, 111 = rep-match
+    jne    .repeat_match        ; rep-match
+
+                                ; 16 bit offset
+    lodsb                       ; load match offset bits 8-15 (the high byte is stored first)
+
+.get_match_length_1:
+    xchg   ah, al               ; move offset bits 8-15 into ah
+    lodsb                       ; load match offset bits 0-7
+
+.get_match_length:
+    xchg   ebp, eax             ; ebp: offset
+.repeat_match:
+    xchg   eax, edx             ; ax: original token
+    and    al, 07H              ; isolate match length in token (MMM)
+    add    al, 2                ; add MIN_MATCH_SIZE_V2
+
+    cmp    al, 09H              ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+    jne    .got_matchlen        ; no, we have the full match length from the token, go copy
+
+    call   .get_nibble          ; get extra match length nibble in cl
+    add    al, cl               ; add nibble to the length taken from the token
+    cmp    al, 018H             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+    jne    .got_matchlen        ; no, we have the full match length from the token, go copy
+
+    lodsb                       ; grab extra length byte
+    add    al,018H              ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15; carry set if the sum exceeds 255
+    jnc    .got_matchlen        ; if not, we have the entire length
+    je     .done_decompressing  ; detect EOD code: the sum wrapped to exactly 0 (extra byte was 0E8H)
+
+    lodsw                       ; grab 16-bit length
+
+.got_matchlen:
+    xchg   ecx, eax             ; copy match length into ecx
+    xchg   esi, eax             ; park the input pointer in eax while esi is used for the copy
+    movsx  ebp, bp              ; sign-extend bp to 32-bits
+    lea    esi,[ebp+edi]        ; esi now points at back reference in output data
+    rep    movsb                ; copy match
+    xchg   esi, eax             ; restore esi
+    jmp    .decode_token        ; go decode another token
+
+.done_decompressing:
+    sub    edi, [esp+32+4]      ; size = end pointer - dword above the return address; NOTE(review): the stack loads at the top are commented out, confirm callers still pass outbuf there
+    mov    [esp+28], edi        ; store the size in the eax slot saved by pushad, so popad returns it in eax
+    popad
+    ret                         ; done
+
+.get_nibble_x:                  ; in: CF from the caller's cmp on token bit 5 (Z), al = 0; out: al = 111, nibble, inverted Z
+    cmc                         ; carry now set if token bit 5 (Z) is set
+    rcr    al, 1                ; park Z in bit 7 of al
+    call   .get_nibble          ; get the next nibble in cl
+    or     al, cl               ; merge nibble
+    rol    al, 1                ; nibble moves to bits 4-1, Z wraps around into bit 0
+    xor    al, 0E1H             ; invert Z in bit 0 and set bits 7-5 to 1
+    ret
+
+.get_nibble:                    ; out: cl = next nibble from the stream; eax is preserved, flags are not
+    neg    bh                   ; toggle nibble state (01H <-> 0FFH); sign clear means a nibble is still buffered in bl
+    jns    .has_nibble
+   
+    xchg   ebx, eax             ; borrow eax for lodsb, parking the caller's eax in ebx
+    lodsb                       ; load two nibbles
+    xchg   ebx, eax             ; bl = new nibble pair, caller's eax restored
+
+.has_nibble:
+    mov    cl, 4                ; swap 4 high and low bits of nibble
+    ror    bl, cl
+    mov    cl, 0FH
+    and    cl, bl               ; cl = nibble: the high nibble of a byte is returned first, then the low one
+    ret
diff --git a/src/lib.h b/src/lib.h
index 5556d2a..2520b13 100755
--- a/src/lib.h
+++ b/src/lib.h
@@ -63,7 +63,7 @@ typedef enum _lzsa_status_t {
 
    /* Decompression-specific status codes */
    LZSA_ERROR_FORMAT,                     /**< Invalid input format or magic number when decompressing */
-   LZSA_ERROR_DECOMPRESSION,              /**< Internal decompression error */
+   LZSA_ERROR_DECOMPRESSION               /**< Internal decompression error */
 } lzsa_status_t;
 
 /* Compression flags */
diff --git a/src/lzsa.c b/src/lzsa.c
index 4cce404..3f6e357 100755
--- a/src/lzsa.c
+++ b/src/lzsa.c
@@ -31,7 +31,6 @@
  */
 
 #include <stdio.h>
-#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #ifdef _WIN32
@@ -48,7 +47,7 @@
 #define OPT_RAW_BACKWARD   8
 #define OPT_STATS          16
 
-#define TOOL_VERSION "1.2.0"
+#define TOOL_VERSION "1.3.6"
 
 /*---------------------------------------------------------------------------*/
 
@@ -295,7 +294,7 @@ int comparestream_open(lzsa_stream_t *stream, const char *pszCompareFilename, co
 
    pCompareStream->pCompareDataBuf = NULL;
    pCompareStream->nCompareDataSize = 0;
-   pCompareStream->f = (void*)fopen(pszCompareFilename, pszMode);
+   pCompareStream->f = (FILE*)fopen(pszCompareFilename, pszMode);
 
    if (pCompareStream->f) {
       stream->obj = pCompareStream;
@@ -866,11 +865,11 @@ int main(int argc, char **argv) {
    const char *pszInFilename = NULL;
    const char *pszOutFilename = NULL;
    const char *pszDictionaryFilename = NULL;
-   bool bArgsError = false;
-   bool bCommandDefined = false;
-   bool bVerifyCompression = false;
-   bool bMinMatchDefined = false;
-   bool bFormatVersionDefined = false;
+   int nArgsError = 0;
+   int nCommandDefined = 0;
+   int nVerifyCompression = 0;
+   int nMinMatchDefined = 0;
+   int nFormatVersionDefined = 0;
    char cCommand = 'z';
    int nMinMatchSize = 0;
    unsigned int nOptions = OPT_FAVOR_RATIO;
@@ -878,51 +877,51 @@ int main(int argc, char **argv) {
 
    for (i = 1; i < argc; i++) {
       if (!strcmp(argv[i], "-d")) {
-         if (!bCommandDefined) {
-            bCommandDefined = true;
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
             cCommand = 'd';
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-z")) {
-         if (!bCommandDefined) {
-            bCommandDefined = true;
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
             cCommand = 'z';
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-c")) {
-         if (!bVerifyCompression) {
-            bVerifyCompression = true;
+         if (!nVerifyCompression) {
+            nVerifyCompression = 1;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-cbench")) {
-         if (!bCommandDefined) {
-            bCommandDefined = true;
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
             cCommand = 'B';
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-dbench")) {
-         if (!bCommandDefined) {
-            bCommandDefined = true;
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
             cCommand = 'b';
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-test")) {
-         if (!bCommandDefined) {
-            bCommandDefined = true;
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
             cCommand = 't';
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-D")) {
          if (!pszDictionaryFilename && (i + 1) < argc) {
@@ -930,119 +929,119 @@ int main(int argc, char **argv) {
             i++;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strncmp(argv[i], "-D", 2)) {
          if (!pszDictionaryFilename) {
             pszDictionaryFilename = argv[i] + 2;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-m")) {
-         if (!bMinMatchDefined && (i + 1) < argc) {
+         if (!nMinMatchDefined && (i + 1) < argc) {
             char *pEnd = NULL;
             nMinMatchSize = (int)strtol(argv[i + 1], &pEnd, 10);
             if (pEnd && pEnd != argv[i + 1] && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
                i++;
-               bMinMatchDefined = true;
+               nMinMatchDefined = 1;
                nOptions &= (~OPT_FAVOR_RATIO);
             }
             else {
-               bArgsError = true;
+               nArgsError = 1;
             }
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strncmp(argv[i], "-m", 2)) {
-         if (!bMinMatchDefined) {
+         if (!nMinMatchDefined) {
             char *pEnd = NULL;
             nMinMatchSize = (int)strtol(argv[i] + 2, &pEnd, 10);
             if (pEnd && pEnd != (argv[i]+2) && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
-               bMinMatchDefined = true;
+               nMinMatchDefined = 1;
                nOptions &= (~OPT_FAVOR_RATIO);
             }
             else {
-               bArgsError = true;
+               nArgsError = 1;
             }
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "--prefer-ratio")) {
-         if (!bMinMatchDefined) {
+         if (!nMinMatchDefined) {
             nMinMatchSize = 0;
-            bMinMatchDefined = true;
+            nMinMatchDefined = 1;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "--prefer-speed")) {
-         if (!bMinMatchDefined) {
+         if (!nMinMatchDefined) {
             nMinMatchSize = 3;
             nOptions &= (~OPT_FAVOR_RATIO);
-            bMinMatchDefined = true;
+            nMinMatchDefined = 1;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-f")) {
-         if (!bFormatVersionDefined && (i + 1) < argc) {
+         if (!nFormatVersionDefined && (i + 1) < argc) {
             char *pEnd = NULL;
             nFormatVersion = (int)strtol(argv[i + 1], &pEnd, 10);
             if (pEnd && pEnd != argv[i + 1] && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
                i++;
-               bFormatVersionDefined = true;
+               nFormatVersionDefined = 1;
             }
             else {
-               bArgsError = true;
+               nArgsError = 1;
             }
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strncmp(argv[i], "-f", 2)) {
-         if (!bFormatVersionDefined) {
+         if (!nFormatVersionDefined) {
             char *pEnd = NULL;
             nFormatVersion = (int)strtol(argv[i] + 2, &pEnd, 10);
             if (pEnd && pEnd != (argv[i] + 2) && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
-               bFormatVersionDefined = true;
+               nFormatVersionDefined = 1;
             }
             else {
-               bArgsError = true;
+               nArgsError = 1;
             }
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-v")) {
          if ((nOptions & OPT_VERBOSE) == 0) {
             nOptions |= OPT_VERBOSE;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-r")) {
          if ((nOptions & OPT_RAW) == 0) {
             nOptions |= OPT_RAW;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-b")) {
          if ((nOptions & OPT_RAW_BACKWARD) == 0) {
             nOptions |= OPT_RAW_BACKWARD;
          }
          else
-            bArgsError = true;
+            nArgsError = 1;
       }
       else if (!strcmp(argv[i], "-stats")) {
       if ((nOptions & OPT_STATS) == 0) {
          nOptions |= OPT_STATS;
       }
       else
-         bArgsError = true;
+         nArgsError = 1;
       }
       else {
          if (!pszInFilename)
@@ -1051,21 +1050,21 @@ int main(int argc, char **argv) {
             if (!pszOutFilename)
                pszOutFilename = argv[i];
             else
-               bArgsError = true;
+               nArgsError = 1;
          }
       }
    }
 
-   if (!bArgsError && (nOptions & OPT_RAW_BACKWARD) && !(nOptions & OPT_RAW)) {
+   if (!nArgsError && (nOptions & OPT_RAW_BACKWARD) && !(nOptions & OPT_RAW)) {
       fprintf(stderr, "error: -b (compress backwards) requires -r (raw block format)\n");
       return 100;
    }
 
-   if (!bArgsError && cCommand == 't') {
+   if (!nArgsError && cCommand == 't') {
       return do_self_test(nOptions, nMinMatchSize, nFormatVersion);
    }
 
-   if (bArgsError || !pszInFilename || !pszOutFilename) {
+   if (nArgsError || !pszInFilename || !pszOutFilename) {
       fprintf(stderr, "lzsa command-line tool v" TOOL_VERSION " by Emmanuel Marty and spke\n");
       fprintf(stderr, "usage: %s [-c] [-d] [-v] [-r] <infile> <outfile>\n", argv[0]);
       fprintf(stderr, "       -c: check resulting stream after compressing\n");
@@ -1089,7 +1088,7 @@ int main(int argc, char **argv) {
 
    if (cCommand == 'z') {
       int nResult = do_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMinMatchSize, nFormatVersion);
-      if (nResult == 0 && bVerifyCompression) {
+      if (nResult == 0 && nVerifyCompression) {
          return do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions, nFormatVersion);
       } else {
          return nResult;
diff --git a/src/matchfinder.c b/src/matchfinder.c
index fbdc5ca..3de2cfa 100644
--- a/src/matchfinder.c
+++ b/src/matchfinder.c
@@ -66,7 +66,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
    int *PLCP = (int*)pCompressor->pos_data;  /* Use temporarily */
    int *Phi = PLCP;
    int nCurLen = 0;
-   int i;
+   int i, r;
 
    /* Compute the permuted LCP first (Kärkkäinen method) */
    Phi[intervals[0]] = -1;
@@ -132,7 +132,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
    intervals[0] = 0;
    next_interval_idx = 1;
 
-   for (int r = 1; r < nInWindowSize; r++) {
+   for (r = 1; r < nInWindowSize; r++) {
       const unsigned int next_pos = SA_and_LCP[r] & POS_MASK;
       const unsigned int next_lcp = SA_and_LCP[r] & LCP_MASK;
       const unsigned int top_lcp = *top & LCP_MASK;
diff --git a/src/shrink_block_v1.c b/src/shrink_block_v1.c
index c30e4a9..32c5c38 100644
--- a/src/shrink_block_v1.c
+++ b/src/shrink_block_v1.c
@@ -157,66 +157,69 @@ static inline int lzsa_get_offset_cost_v1(const unsigned int nMatchOffset) {
  * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
  */
 static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce) {
-   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
+   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << ARRIVALS_PER_POSITION_SHIFT);
    const int nMinMatchSize = pCompressor->min_match_size;
    const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
+   const int nModeSwitchPenalty = nFavorRatio ? 0 : MODESWITCH_PENALTY;
    const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
    int i, j, n;
 
    if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
 
-   memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
+   memset(arrival + (nStartOffset << ARRIVALS_PER_POSITION_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << ARRIVALS_PER_POSITION_SHIFT));
 
-   arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
+   arrival[nStartOffset << ARRIVALS_PER_POSITION_SHIFT].from_slot = -1;
 
    for (i = nStartOffset; i != nEndOffset; i++) {
+      lzsa_arrival* cur_arrival = &arrival[i << ARRIVALS_PER_POSITION_SHIFT];
       int m;
 
-      for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-         int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
+      for (j = 0; j < NARRIVALS_PER_POSITION_V1 && cur_arrival[j].from_slot; j++) {
+         int nPrevCost = cur_arrival[j].cost;
          int nCodingChoiceCost = nPrevCost + 8 /* literal */;
-         int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
-         int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;
+         int nScore = cur_arrival[j].score + 1;
+         int nNumLiterals = cur_arrival[j].num_literals + 1;
 
          if (nNumLiterals == LITERALS_RUN_LEN_V1 || nNumLiterals == 256 || nNumLiterals == 512) {
             nCodingChoiceCost += 8;
          }
 
-         if (!nFavorRatio && nNumLiterals == 1)
-            nCodingChoiceCost += MODESWITCH_PENALTY;
+         if (nNumLiterals == 1)
+            nCodingChoiceCost += nModeSwitchPenalty;
 
-         for (n = 0; n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
-            lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n];
+         lzsa_arrival *pDestSlots = &arrival[(i + 1) << ARRIVALS_PER_POSITION_SHIFT];
+         for (n = 0; n < NARRIVALS_PER_POSITION_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
+            lzsa_arrival *pDestArrival = &pDestSlots[n];
 
             if (pDestArrival->from_slot == 0 ||
                nCodingChoiceCost < pDestArrival->cost ||
                (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-               memmove(&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
-                  &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n],
-                  sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
+               memmove(&arrival[((i + 1) << ARRIVALS_PER_POSITION_SHIFT) + n + 1],
+                  &arrival[((i + 1) << ARRIVALS_PER_POSITION_SHIFT) + n],
+                  sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - n - 1));
 
                pDestArrival->cost = nCodingChoiceCost;
                pDestArrival->from_pos = i;
                pDestArrival->from_slot = j + 1;
-               pDestArrival->match_offset = 0;
                pDestArrival->match_len = 0;
                pDestArrival->num_literals = nNumLiterals;
                pDestArrival->score = nScore;
-               pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+               pDestArrival->rep_offset = cur_arrival[j].rep_offset;
                break;
             }
          }
       }
 
       const lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V1);
+      int nNumArrivalsForThisPos = j;
 
       for (m = 0; m < NMATCHES_PER_INDEX_V1 && match[m].length; m++) {
          int nMatchLen = match[m].length;
          int nMatchOffsetCost = lzsa_get_offset_cost_v1(match[m].offset);
          int nStartingMatchLen, k;
 
-         if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
-            nMatchLen = nEndOffset - LAST_LITERALS - i;
+         if ((i + nMatchLen) > nEndOffset)
+            nMatchLen = nEndOffset - i;
 
          if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
             nStartingMatchLen = nMatchLen;
@@ -225,43 +228,48 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
          for (k = nStartingMatchLen; k <= nMatchLen; k++) {
             int nMatchLenCost = lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
 
-            for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-               int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
+            lzsa_arrival *pDestSlots = &arrival[(i + k) << ARRIVALS_PER_POSITION_SHIFT];
+
+            for (j = 0; j < nNumArrivalsForThisPos; j++) {
+               int nPrevCost = cur_arrival[j].cost;
                int nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
-               int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 5;
                int exists = 0;
 
-               if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
-                  nCodingChoiceCost += MODESWITCH_PENALTY;
+               if (!cur_arrival[j].num_literals)
+                  nCodingChoiceCost += nModeSwitchPenalty;
 
                for (n = 0;
-                  n < NMATCHES_PER_ARRIVAL_V1 && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].cost <= nCodingChoiceCost;
+                  n < NARRIVALS_PER_POSITION_V1 && pDestSlots[n].from_slot && pDestSlots[n].cost <= nCodingChoiceCost;
                   n++) {
-                  if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
+                  if (lzsa_get_offset_cost_v1(pDestSlots[n].rep_offset) == nMatchOffsetCost) {
                      exists = 1;
                      break;
                   }
                }
 
-               for (n = 0; !exists && n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
-                  lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n];
+               if (!exists) {
+                  int nScore = cur_arrival[j].score + 5;
 
-                  if (pDestArrival->from_slot == 0 ||
-                     nCodingChoiceCost < pDestArrival->cost ||
-                     (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-                     memmove(&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
-                        &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n],
-                        sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
+                  for (n = 0; n < NARRIVALS_PER_POSITION_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
+                     lzsa_arrival *pDestArrival = &pDestSlots[n];
 
-                     pDestArrival->cost = nCodingChoiceCost;
-                     pDestArrival->from_pos = i;
-                     pDestArrival->from_slot = j + 1;
-                     pDestArrival->match_offset = match[m].offset;
-                     pDestArrival->match_len = k;
-                     pDestArrival->num_literals = 0;
-                     pDestArrival->score = nScore;
-                     pDestArrival->rep_offset = match[m].offset;
-                     break;
+                     if (pDestArrival->from_slot == 0 ||
+                        nCodingChoiceCost < pDestArrival->cost ||
+                        (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
+                        memmove(&pDestSlots[n + 1],
+                           &pDestSlots[n],
+                           sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - n - 1));
+
+                        pDestArrival->cost = nCodingChoiceCost;
+                        pDestArrival->from_pos = i;
+                        pDestArrival->from_slot = j + 1;
+                        pDestArrival->match_len = k;
+                        pDestArrival->num_literals = 0;
+                        pDestArrival->score = nScore;
+                        pDestArrival->rep_offset = match[m].offset;
+                        j = NARRIVALS_PER_POSITION_V1;
+                        break;
+                     }
                   }
                }
             }
@@ -269,14 +277,17 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
       }
    }
 
-   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];
+   lzsa_arrival *end_arrival = &arrival[(i << ARRIVALS_PER_POSITION_SHIFT) + 0];
 
    while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
       if (end_arrival->from_pos >= nEndOffset) return;
       pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
-      pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
+      if (end_arrival->match_len)
+         pBestMatch[end_arrival->from_pos].offset = end_arrival->rep_offset;
+      else
+         pBestMatch[end_arrival->from_pos].offset = 0;
 
-      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
+      end_arrival = &arrival[(end_arrival->from_pos << ARRIVALS_PER_POSITION_SHIFT) + (end_arrival->from_slot - 1)];
    }
 }
 
@@ -301,12 +312,12 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const un
       lzsa_match *pMatch = pBestMatch + i;
 
       if (pMatch->length == 0 &&
-         (i + 1) < (nEndOffset - LAST_LITERALS) &&
+         (i + 1) < nEndOffset &&
          pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V1 &&
          pBestMatch[i + 1].length < MAX_VARLEN &&
          pBestMatch[i + 1].offset &&
          i >= pBestMatch[i + 1].offset &&
-         (i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
+         (i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
          !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
          int nCurLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V1);
          int nReducedLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V1);
@@ -413,8 +424,6 @@ static int lzsa_get_compressed_size_v1(lzsa_compressor *pCompressor, lzsa_match
          int nMatchOffset = pMatch->offset;
          int nMatchLen = pMatch->length;
          int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
-         int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
-         int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN_V1) ? MATCH_RUN_LEN_V1 : nEncodedMatchLen;
          int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
          int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
 
@@ -429,7 +438,6 @@ static int lzsa_get_compressed_size_v1(lzsa_compressor *pCompressor, lzsa_match
    }
 
    {
-      int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
       int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3);
 
       nCompressedSize += nCommandSize;
diff --git a/src/shrink_block_v2.c b/src/shrink_block_v2.c
index eb4a16e..fc6d232 100644
--- a/src/shrink_block_v2.c
+++ b/src/shrink_block_v2.c
@@ -43,22 +43,18 @@
  * @param nOutOffset current write index into output buffer
  * @param nMaxOutDataSize maximum size of output buffer, in bytes
  * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles
- * @param nCurFreeNibbles current number of free nibbles in byte
  * @param nNibbleValue value to write (0..15)
  */
-static int lzsa_write_nibble_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nNibbleValue) {
+static int lzsa_write_nibble_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int nNibbleValue) {
    if (nOutOffset < 0) return -1;
 
    if ((*nCurNibbleOffset) == -1) {
       if (nOutOffset >= nMaxOutDataSize) return -1;
       (*nCurNibbleOffset) = nOutOffset;
-      (*nCurFreeNibbles) = 2;
-      pOutData[nOutOffset++] = 0;
+      pOutData[nOutOffset++] = nNibbleValue << 4;
    }
-
-   pOutData[*nCurNibbleOffset] = (pOutData[*nCurNibbleOffset] << 4) | (nNibbleValue & 0x0f);
-   (*nCurFreeNibbles)--;
-   if ((*nCurFreeNibbles) == 0) {
+   else {
+      pOutData[*nCurNibbleOffset] = (pOutData[*nCurNibbleOffset]) | (nNibbleValue & 0x0f);
       (*nCurNibbleOffset) = -1;
    }
 
@@ -96,15 +92,17 @@ static inline int lzsa_get_literals_varlen_size_v2(const int nLength) {
  *
  * @param pOutData pointer to output buffer
  * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles
  * @param nLength literals length
  */
-static inline int lzsa_write_literals_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nLength) {
+static inline int lzsa_write_literals_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int nLength) {
    if (nLength >= LITERALS_RUN_LEN_V2) {
       if (nLength < (LITERALS_RUN_LEN_V2 + 15)) {
-         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, nLength - LITERALS_RUN_LEN_V2);
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nLength - LITERALS_RUN_LEN_V2);
       }
       else {
-         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, 15);
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, 15);
          if (nOutOffset < 0) return -1;
 
          if (nLength < 256)
@@ -150,15 +148,17 @@ static inline int lzsa_get_match_varlen_size_v2(const int nLength) {
  *
  * @param pOutData pointer to output buffer
  * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles
  * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V2)
  */
-static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nLength) {
+static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int nLength) {
    if (nLength >= MATCH_RUN_LEN_V2) {
       if (nLength < (MATCH_RUN_LEN_V2 + 15)) {
-         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, nLength - MATCH_RUN_LEN_V2);
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nLength - MATCH_RUN_LEN_V2);
       }
       else {
-         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, 15);
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, 15);
          if (nOutOffset < 0) return -1;
 
          if ((nLength + MIN_MATCH_SIZE_V2) < 256)
@@ -183,59 +183,73 @@ static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOf
  * @param nMatchOffset match offset to use as rep candidate
  * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
  * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
- * @param nMatchesPerArrival number of arrivals to record per input buffer position
  * @param nDepth current insertion depth
  */
-static void lzsa_insert_forward_match_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, const int nMatchesPerArrival, int nDepth) {
-   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
+static void lzsa_insert_forward_match_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, int nDepth) {
+   lzsa_arrival *arrival = pCompressor->arrival + ((i - nStartOffset) << ARRIVALS_PER_POSITION_SHIFT);
+   const int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   lzsa_match* visited = ((lzsa_match*)pCompressor->pos_data) - nStartOffset /* reuse */;
    int j;
 
-   if (nDepth >= 10) return;
+   for (j = 0; j < NARRIVALS_PER_POSITION_V2_BIG && arrival[j].from_slot; j++) {
+      int nRepOffset = arrival[j].rep_offset;
 
-   for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-      int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
-
-      if (nMatchOffset != nRepOffset && nRepOffset && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len >= MIN_MATCH_SIZE_V2) {
-         int nRepPos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
-         int nRepLen = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
+      if (nMatchOffset != nRepOffset && nRepOffset && arrival[j].rep_len >= MIN_MATCH_SIZE_V2) {
+         int nRepPos = arrival[j].rep_pos;
+         int nRepLen = arrival[j].rep_len;
 
          if (nRepPos > nMatchOffset &&
-            (nRepPos - nMatchOffset + nRepLen) <= (nEndOffset - LAST_LITERALS) &&
-            !memcmp(pInWindow + nRepPos - nRepOffset, pInWindow + nRepPos - nMatchOffset, nRepLen)) {
-            int nCurRepLen = nRepLen;
+            (nRepPos + nRepLen) <= nEndOffset &&
+            pCompressor->match[((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2) + NMATCHES_PER_INDEX_V2 - 1].length == 0) {
 
-            int nMaxRepLen = nEndOffset - nRepPos;
-            if (nMaxRepLen > LCP_MAX)
-               nMaxRepLen = LCP_MAX;
-            while ((nCurRepLen + 8) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 8))
-               nCurRepLen += 8;
-            while ((nCurRepLen + 4) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 4))
-               nCurRepLen += 4;
-            while (nCurRepLen < nMaxRepLen && pInWindow[nRepPos + nCurRepLen] == pInWindow[nRepPos - nMatchOffset + nCurRepLen])
-               nCurRepLen++;
+            if (visited[nRepPos].offset != nMatchOffset || visited[nRepPos].length > nRepLen) {
+               visited[nRepPos].offset = nMatchOffset;
+               visited[nRepPos].length = nRepLen;
 
-            lzsa_match *fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);
-            int exists = 0;
-            int r;
+               if (pInWindow[nRepPos] == pInWindow[nRepPos - nMatchOffset]) {
+                  int nLen0 = rle_len[nRepPos - nMatchOffset];
+                  int nLen1 = rle_len[nRepPos];
+                  int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
 
-            for (r = 0; r < NMATCHES_PER_INDEX_V2 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) {
-               if (fwd_match[r].offset == nMatchOffset) {
-                  exists = 1;
+                  if (nMinLen >= nRepLen || !memcmp(pInWindow + nRepPos + nMinLen, pInWindow + nRepPos + nMinLen - nMatchOffset, nRepLen - nMinLen)) {
+                     visited[nRepPos].length = 0;
 
-                  if (fwd_match[r].length < nCurRepLen) {
-                     fwd_match[r].length = nCurRepLen;
-                     lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
+                     lzsa_match* fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);
+                     int r;
+
+                     for (r = 0; r < NMATCHES_PER_INDEX_V2 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) {
+                        if (fwd_match[r].offset == nMatchOffset) {
+                           r = NMATCHES_PER_INDEX_V2;
+                           break;
+                        }
+                     }
+
+                     if (r < NMATCHES_PER_INDEX_V2) {
+                        int nMaxRepLen = nEndOffset - nRepPos;
+                        if (nMaxRepLen > LCP_MAX)
+                           nMaxRepLen = LCP_MAX;
+                        int nCurRepLen = (nMinLen > nRepLen) ? nMinLen : nRepLen;
+                        if (nCurRepLen > nMaxRepLen)
+                           nCurRepLen = nMaxRepLen;
+                        const unsigned char* pInWindowMax = pInWindow + nRepPos + nMaxRepLen;
+                        const unsigned char* pInWindowAtRepPos = pInWindow + nRepPos + nCurRepLen;
+                        while ((pInWindowAtRepPos + 8) < pInWindowMax && !memcmp(pInWindowAtRepPos, pInWindowAtRepPos - nMatchOffset, 8))
+                           pInWindowAtRepPos += 8;
+                        while ((pInWindowAtRepPos + 4) < pInWindowMax && !memcmp(pInWindowAtRepPos, pInWindowAtRepPos - nMatchOffset, 4))
+                           pInWindowAtRepPos += 4;
+                        while (pInWindowAtRepPos < pInWindowMax && pInWindowAtRepPos[0] == pInWindowAtRepPos[-nMatchOffset])
+                           pInWindowAtRepPos++;
+
+                        nCurRepLen = (int)(pInWindowAtRepPos - (pInWindow + nRepPos));
+                        fwd_match[r].offset = nMatchOffset;
+                        fwd_match[r].length = nCurRepLen;
+
+                        if (nDepth < 9)
+                           lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nDepth + 1);
+                     }
                   }
-                  break;
                }
             }
-
-            if (!exists && r < NMATCHES_PER_INDEX_V2) {
-               fwd_match[r].offset = nMatchOffset;
-               fwd_match[r].length = nCurRepLen;
-
-               lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
-            }
          }
       }
    }
@@ -251,33 +265,44 @@ static void lzsa_insert_forward_match_v2(lzsa_compressor *pCompressor, const uns
  * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
  * @param nReduce non-zero to reduce the number of tokens when the path costs are equal, zero not to
  * @param nInsertForwardReps non-zero to insert forward repmatch candidates, zero to use the previously inserted candidates
- * @param nMatchesPerArrival number of arrivals to record per input buffer position
+ * @param nArrivalsPerPosition number of arrivals to record per input buffer position
  */
-static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps, const int nMatchesPerArrival) {
-   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
-   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
+static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps, const int nArrivalsPerPosition) {
+   lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << ARRIVALS_PER_POSITION_SHIFT);
+   const int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   lzsa_match *visited = ((lzsa_match*)pCompressor->pos_data) - nStartOffset /* reuse */;
+   char *nRepSlotHandledMask = pCompressor->rep_slot_handled_mask;
+   char *nRepLenHandledMask = pCompressor->rep_len_handled_mask;
+   const int nModeSwitchPenalty = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 0 : MODESWITCH_PENALTY;
    const int nMinMatchSize = pCompressor->min_match_size;
    const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
-   const int nLeaveAloneMatchSize = (nMatchesPerArrival == NMATCHES_PER_ARRIVAL_V2_SMALL) ? LEAVE_ALONE_MATCH_SIZE_SMALL : LEAVE_ALONE_MATCH_SIZE;
+   const int nMaxRepInsertedLen = nReduce ? LEAVE_ALONE_MATCH_SIZE : 0;
+   const int nLeaveAloneMatchSize = (nArrivalsPerPosition == NARRIVALS_PER_POSITION_V2_SMALL) ? LEAVE_ALONE_MATCH_SIZE_SMALL : LEAVE_ALONE_MATCH_SIZE;
    int i, j, n;
 
    if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
 
-   memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
+   memset(arrival + (nStartOffset << ARRIVALS_PER_POSITION_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << ARRIVALS_PER_POSITION_SHIFT));
 
-   for (i = (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT); i != ((nEndOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT); i++) {
+   for (i = (nStartOffset << ARRIVALS_PER_POSITION_SHIFT); i != ((nEndOffset + 1) << ARRIVALS_PER_POSITION_SHIFT); i++) {
       arrival[i].cost = 0x40000000;
    }
 
-   arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
+   arrival[nStartOffset << ARRIVALS_PER_POSITION_SHIFT].from_slot = -1;
+
+   if (nInsertForwardReps) {
+      memset(visited + nStartOffset, 0, (nEndOffset - nStartOffset) * sizeof(lzsa_match));
+   }
 
    for (i = nStartOffset; i != nEndOffset; i++) {
+      lzsa_arrival *cur_arrival = &arrival[i << ARRIVALS_PER_POSITION_SHIFT];
       int m;
 
-      for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-         const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
+      for (j = 0; j < nArrivalsPerPosition && cur_arrival[j].from_slot; j++) {
+         const int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
          int nCodingChoiceCost = nPrevCost + 8 /* literal */;
-         int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;
+         int nScore = cur_arrival[j].score + 1;
+         int nNumLiterals = cur_arrival[j].num_literals + 1;
 
          if (nNumLiterals == LITERALS_RUN_LEN_V2) {
             nCodingChoiceCost += 4;
@@ -289,52 +314,70 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
             nCodingChoiceCost += 16;
          }
 
-         if (!nFavorRatio && nNumLiterals == 1)
-            nCodingChoiceCost += MODESWITCH_PENALTY;
+         if (nNumLiterals == 1)
+            nCodingChoiceCost += nModeSwitchPenalty;
 
-         lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_ARRIVAL_SHIFT];
-         if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
+         lzsa_arrival *pDestSlots = &cur_arrival[1 << ARRIVALS_PER_POSITION_SHIFT];
+         if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+            (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < (pDestSlots[nArrivalsPerPosition - 1].score + nDisableScore))) {
+            int nRepOffset = cur_arrival[j].rep_offset;
             int exists = 0;
+
             for (n = 0;
-               n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
+               n < nArrivalsPerPosition && pDestSlots[n].cost < nCodingChoiceCost;
                n++) {
-               if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset) {
+               if (pDestSlots[n].rep_offset == nRepOffset) {
                   exists = 1;
                   break;
                }
             }
 
             if (!exists) {
-               int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
-               for (n = 0; n < nMatchesPerArrival; n++) {
-                  lzsa_arrival *pDestArrival = &pDestSlots[n];
-                  if (nCodingChoiceCost < pDestArrival->cost ||
-                     (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
+               for (;
+                  n < nArrivalsPerPosition && pDestSlots[n].cost == nCodingChoiceCost && nScore >= (pDestSlots[n].score + nDisableScore);
+                  n++) {
+                  if (pDestSlots[n].rep_offset == nRepOffset) {
+                     exists = 1;
+                     break;
+                  }
+               }
 
-                     if (pDestArrival->from_slot) {
+               if (!exists) {
+                  if (n < nArrivalsPerPosition) {
+                     int nn;
+
+                     for (nn = n;
+                        nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                        nn++) {
+                        if (pDestSlots[nn].rep_offset == nRepOffset) {
+                           exists = 1;
+                           break;
+                        }
+                     }
+
+                     if (!exists) {
                         int z;
 
-                        for (z = n; z < nMatchesPerArrival - 1; z++) {
-                           if (pDestSlots[z].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset)
+                        for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                           if (pDestSlots[z].rep_offset == nRepOffset)
                               break;
                         }
 
                         memmove(&pDestSlots[n + 1],
                            &pDestSlots[n],
                            sizeof(lzsa_arrival) * (z - n));
-                     }
 
-                     pDestArrival->cost = nCodingChoiceCost;
-                     pDestArrival->from_pos = i;
-                     pDestArrival->from_slot = j + 1;
-                     pDestArrival->match_offset = 0;
-                     pDestArrival->match_len = 0;
-                     pDestArrival->num_literals = nNumLiterals;
-                     pDestArrival->score = nScore;
-                     pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
-                     pDestArrival->rep_pos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
-                     pDestArrival->rep_len = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
-                     break;
+                        lzsa_arrival* pDestArrival = &pDestSlots[n];
+                        pDestArrival->cost = nCodingChoiceCost;
+                        pDestArrival->from_pos = i;
+                        pDestArrival->from_slot = j + 1;
+                        pDestArrival->match_len = 0;
+                        pDestArrival->num_literals = nNumLiterals;
+                        pDestArrival->score = nScore;
+                        pDestArrival->rep_offset = nRepOffset;
+                        pDestArrival->rep_pos = cur_arrival[j].rep_pos;
+                        pDestArrival->rep_len = cur_arrival[j].rep_len;
+                     }
                   }
                }
             }
@@ -342,203 +385,276 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
       }
 
       lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);
+      int nNumArrivalsForThisPos = j, nMinOverallRepLen = 0, nMaxOverallRepLen = 0;
 
-      int nMinRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];
-      memset(nMinRepLen, 0, nMatchesPerArrival * sizeof(int));
+      int nRepMatchArrivalIdxAndLen[(NARRIVALS_PER_POSITION_V2_BIG * 2) + 1];
+      int nNumRepMatchArrivals = 0;
+
+      int nMaxRepLenForPos = nEndOffset - i;
+      if (nMaxRepLenForPos > LCP_MAX)
+         nMaxRepLenForPos = LCP_MAX;
+      const unsigned char* pInWindowStart = pInWindow + i;
+      const unsigned char* pInWindowMax = pInWindowStart + nMaxRepLenForPos;
+
+      for (j = 0; j < nNumArrivalsForThisPos && (i + MIN_MATCH_SIZE_V2) <= nEndOffset; j++) {
+         int nRepOffset = cur_arrival[j].rep_offset;
+
+         if (nRepOffset) {
+            if (i > nRepOffset) {
+               if (pInWindow[i] == pInWindow[i - nRepOffset]) {
+                  const unsigned char* pInWindowAtPos;
+
+                  int nLen0 = rle_len[i - nRepOffset];
+                  int nLen1 = rle_len[i];
+                  int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
+
+                  if (nMinLen > nMaxRepLenForPos)
+                     nMinLen = nMaxRepLenForPos;
+                  pInWindowAtPos = pInWindowStart + nMinLen;
+
+                  while ((pInWindowAtPos + 8) < pInWindowMax && !memcmp(pInWindowAtPos - nRepOffset, pInWindowAtPos, 8))
+                     pInWindowAtPos += 8;
+                  while ((pInWindowAtPos + 4) < pInWindowMax && !memcmp(pInWindowAtPos - nRepOffset, pInWindowAtPos, 4))
+                     pInWindowAtPos += 4;
+                  while (pInWindowAtPos < pInWindowMax && pInWindowAtPos[-nRepOffset] == pInWindowAtPos[0])
+                     pInWindowAtPos++;
+                  int nCurRepLen = (int)(pInWindowAtPos - pInWindowStart);
+
+                  if (nCurRepLen >= MIN_MATCH_SIZE_V2) {
+                     if (nMaxOverallRepLen < nCurRepLen)
+                        nMaxOverallRepLen = nCurRepLen;
+                     nRepMatchArrivalIdxAndLen[nNumRepMatchArrivals++] = j;
+                     nRepMatchArrivalIdxAndLen[nNumRepMatchArrivals++] = nCurRepLen;
+                  }
+               }
+            }
+         }
+      }
+      nRepMatchArrivalIdxAndLen[nNumRepMatchArrivals] = -1;
+
+      if (!nReduce) {
+         memset(nRepSlotHandledMask, 0, nArrivalsPerPosition * ((LCP_MAX + 1) / 8) * sizeof(char));
+      }
+      memset(nRepLenHandledMask, 0, ((LCP_MAX + 1) / 8) * sizeof(char));
 
       for (m = 0; m < NMATCHES_PER_INDEX_V2 && match[m].length; m++) {
          int nMatchLen = match[m].length & 0x7fff;
          int nMatchOffset = match[m].offset;
-         int nScorePenalty = ((match[m].length & 0x8000) >> 15);
+         int nScorePenalty = 3 + ((match[m].length & 0x8000) >> 15);
          int nNoRepmatchOffsetCost = (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16));
          int nStartingMatchLen, k;
-         int nMaxRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];
 
-         if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
-            nMatchLen = nEndOffset - LAST_LITERALS - i;
-
-         for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-            int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
-            int nCurMaxRepLen = 0;
-
-            if (nRepOffset) {
-               if (nMatchOffset == nRepOffset)
-                  nCurMaxRepLen = nMatchLen;
-               else {
-                  if (i > nRepOffset &&
-                     (i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
-                     nCurMaxRepLen = nMinRepLen[j];
-                     while ((nCurMaxRepLen + 8) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 8))
-                        nCurMaxRepLen += 8;
-                     while ((nCurMaxRepLen + 4) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 4))
-                        nCurMaxRepLen += 4;
-                     while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i + nCurMaxRepLen])
-                        nCurMaxRepLen++;
-                     nMinRepLen[j] = nCurMaxRepLen;
-                  }
-               }
-            }
-
-            nMaxRepLen[j] = nCurMaxRepLen;
-         }
-         while (j < nMatchesPerArrival)
-            nMaxRepLen[j++] = 0;
+         if ((i + nMatchLen) > nEndOffset)
+            nMatchLen = nEndOffset - i;
 
          if (nInsertForwardReps)
-            lzsa_insert_forward_match_v2(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, 0);
+            lzsa_insert_forward_match_v2(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, 0);
 
-         int nMatchLenCost = 0;
+         int nNonRepMatchArrivalIdx = -1;
+         for (j = 0; j < nNumArrivalsForThisPos; j++) {
+            int nRepOffset = cur_arrival[j].rep_offset;
+
+            if (nMatchOffset != nRepOffset) {
+               nNonRepMatchArrivalIdx = j;
+               break;
+            }
+         }
+
+         int nMatchLenCost;
          if (nMatchLen >= nLeaveAloneMatchSize) {
             nStartingMatchLen = nMatchLen;
-            nMatchLenCost = 4 + 24;
+            nMatchLenCost = 4 + 24 + 8 /* token */;
          }
          else {
             nStartingMatchLen = nMinMatchSize;
-            nMatchLenCost = 0;
+            nMatchLenCost = 0 + 8 /* token */;
          }
 
          for (k = nStartingMatchLen; k <= nMatchLen; k++) {
             if (k == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2)) {
-               nMatchLenCost = 4;
+               nMatchLenCost = 4 + 8 /* token */;
             }
             else {
                if (k == (MATCH_RUN_LEN_V2 + 15 + MIN_MATCH_SIZE_V2))
-                  nMatchLenCost = 4 + 8;
+                  nMatchLenCost = 4 + 8 + 8 /* token */;
                else {
                   if (k == 256)
-                     nMatchLenCost = 4 + 24;
+                     nMatchLenCost = 4 + 24 + 8 /* token */;
                }
             }
 
-            lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_ARRIVAL_SHIFT];
-            int nInsertedNoRepMatchCandidate = 0;
+            lzsa_arrival *pDestSlots = &cur_arrival[k << ARRIVALS_PER_POSITION_SHIFT];
 
-            for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
-               const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
-               int nRepCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost;
+            /* Insert non-repmatch candidate */
 
-               if (nRepCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
-                  int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+            if (nNonRepMatchArrivalIdx >= 0) {
+               const int nPrevCost = cur_arrival[nNonRepMatchArrivalIdx].cost & 0x3fffffff;
+               int nCodingChoiceCost = nPrevCost /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost + nNoRepmatchOffsetCost;
 
-                  if (nMatchOffset != nRepOffset && !nInsertedNoRepMatchCandidate) {
-                     int nCodingChoiceCost = nRepCodingChoiceCost + nNoRepmatchOffsetCost;
+               if (!cur_arrival[nNonRepMatchArrivalIdx].num_literals)
+                  nCodingChoiceCost += nModeSwitchPenalty;
 
-                     if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
-                        nCodingChoiceCost += MODESWITCH_PENALTY;
+               int nScore = cur_arrival[nNonRepMatchArrivalIdx].score + nScorePenalty;
+               if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 2].cost ||
+                  (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 2].cost && nScore < (pDestSlots[nArrivalsPerPosition - 2].score + nDisableScore))) {
+                  int exists = 0;
 
-                     if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
-                        int exists = 0;
-                        int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 3 + nScorePenalty;
-
-                        for (n = 0;
-                           n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
-                           n++) {
-                           if (pDestSlots[n].rep_offset == nMatchOffset &&
-                              (!nInsertForwardReps || pDestSlots[n].cost != nCodingChoiceCost || pDestSlots[n].rep_pos >= i || nScore >= (pDestSlots[n].score + nDisableScore) ||
-                                 pDestSlots[nMatchesPerArrival - 1].from_slot)) {
-                              exists = 1;
-                              break;
-                           }
-                        }
-
-                        if (!exists) {
-                           for (n = 0; n < nMatchesPerArrival - 1; n++) {
-                              lzsa_arrival *pDestArrival = &pDestSlots[n];
-
-                              if (nCodingChoiceCost < pDestArrival->cost ||
-                                 (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-                                 if (pDestArrival->from_slot) {
-                                    int z;
-
-                                    for (z = n; z < nMatchesPerArrival - 1; z++) {
-                                       if (pDestSlots[z].rep_offset == nMatchOffset)
-                                          break;
-                                    }
-
-                                    if (z == (nMatchesPerArrival - 1) && pDestSlots[z].from_slot && pDestSlots[z].match_len < MIN_MATCH_SIZE_V2)
-                                       z--;
-
-                                    memmove(&pDestSlots[n + 1],
-                                       &pDestSlots[n],
-                                       sizeof(lzsa_arrival) * (z - n));
-                                 }
-
-                                 pDestArrival->cost = nCodingChoiceCost;
-                                 pDestArrival->from_pos = i;
-                                 pDestArrival->from_slot = j + 1;
-                                 pDestArrival->match_offset = nMatchOffset;
-                                 pDestArrival->match_len = k;
-                                 pDestArrival->num_literals = 0;
-                                 pDestArrival->score = nScore;
-                                 pDestArrival->rep_offset = nMatchOffset;
-                                 pDestArrival->rep_pos = i;
-                                 pDestArrival->rep_len = k;
-                                 nInsertedNoRepMatchCandidate = 1;
-                                 break;
-                              }
-                           }
-                        }
+                  for (n = 0;
+                     n < nArrivalsPerPosition && pDestSlots[n].cost < nCodingChoiceCost;
+                     n++) {
+                     if (pDestSlots[n].rep_offset == nMatchOffset) {
+                        exists = 1;
+                        break;
                      }
                   }
 
-                  /* If this coding choice doesn't rep-match, see if we still get a match by using the current repmatch offset for this arrival. This can occur (and not have the
-                   * matchfinder offer the offset in the first place, or have too many choices with the same cost to retain the repmatchable offset) when compressing regions
-                   * of identical bytes, for instance. Checking for this provides a big compression win on some files. */
-
-                  if (nMaxRepLen[j] >= k) {
-                     int exists = 0;
-
-                     /* A match is possible at the rep offset; insert the extra coding choice. */
-
-                     for (n = 0;
-                        n < nMatchesPerArrival && pDestSlots[n].cost <= nRepCodingChoiceCost;
+                  if (!exists) {
+                     for (;
+                        n < nArrivalsPerPosition && pDestSlots[n].cost == nCodingChoiceCost && nScore >= (pDestSlots[n].score + nDisableScore);
                         n++) {
-                        if (pDestSlots[n].rep_offset == nRepOffset) {
+                        if (pDestSlots[n].rep_offset == nMatchOffset) {
                            exists = 1;
                            break;
                         }
                      }
 
                      if (!exists) {
-                        int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 2;
+                        if (n < nArrivalsPerPosition - 1) {
+                           int nn;
 
-                        for (n = 0; n < nMatchesPerArrival; n++) {
-                           lzsa_arrival *pDestArrival = &pDestSlots[n];
+                           for (nn = n;
+                              nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                              nn++) {
+                              if (pDestSlots[nn].rep_offset == nMatchOffset &&
+                                 (!nInsertForwardReps || pDestSlots[nn].rep_pos >= i ||
+                                    pDestSlots[nArrivalsPerPosition - 1].from_slot)) {
+                                 exists = 1;
+                                 break;
+                              }
+                           }
 
-                           if (nRepCodingChoiceCost < pDestArrival->cost ||
-                              (nRepCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
-                              if (pDestArrival->from_slot) {
-                                 int z;
+                           if (!exists) {
+                              int z;
 
-                                 for (z = n; z < nMatchesPerArrival - 1; z++) {
-                                    if (pDestSlots[z].rep_offset == nRepOffset)
-                                       break;
-                                 }
-
-                                 memmove(&pDestSlots[n + 1],
-                                    &pDestSlots[n],
-                                    sizeof(lzsa_arrival) * (z - n));
+                              for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                 if (pDestSlots[z].rep_offset == nMatchOffset)
+                                    break;
                               }
 
-                              pDestArrival->cost = nRepCodingChoiceCost;
+                              if (z == (nArrivalsPerPosition - 1) && pDestSlots[z].from_slot && pDestSlots[z].match_len < MIN_MATCH_SIZE_V2)
+                                 z--;
+
+                              memmove(&pDestSlots[n + 1],
+                                 &pDestSlots[n],
+                                 sizeof(lzsa_arrival) * (z - n));
+
+                              lzsa_arrival* pDestArrival = &pDestSlots[n];
+                              pDestArrival->cost = nCodingChoiceCost;
                               pDestArrival->from_pos = i;
-                              pDestArrival->from_slot = j + 1;
-                              pDestArrival->match_offset = nRepOffset;
+                              pDestArrival->from_slot = nNonRepMatchArrivalIdx + 1;
                               pDestArrival->match_len = k;
                               pDestArrival->num_literals = 0;
                               pDestArrival->score = nScore;
-                              pDestArrival->rep_offset = nRepOffset;
+                              pDestArrival->rep_offset = nMatchOffset;
                               pDestArrival->rep_pos = i;
                               pDestArrival->rep_len = k;
-                              break;
+                              nRepLenHandledMask[k >> 3] &= ~(1 << (k & 7));
                            }
                         }
                      }
                   }
                }
-               else {
-                  break;
+            }
+
+            /* Insert repmatch candidates */
+
+            if (k > nMinOverallRepLen && k <= nMaxOverallRepLen && (nRepLenHandledMask[k >> 3] & (1 << (k & 7))) == 0) {
+               int nCurRepMatchArrival;
+
+               nRepLenHandledMask[k >> 3] |= 1 << (k & 7);
+
+               for (nCurRepMatchArrival = 0; (j = nRepMatchArrivalIdxAndLen[nCurRepMatchArrival]) >= 0; nCurRepMatchArrival += 2) {
+                  int nMaskOffset = (j << 7) + (k >> 3);
+                  if (nRepMatchArrivalIdxAndLen[nCurRepMatchArrival + 1] >= k && (nReduce || !(nRepSlotHandledMask[nMaskOffset] & (1 << (k & 7))))) {
+                     const int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+                     int nRepCodingChoiceCost = nPrevCost /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost;
+                     int nScore = cur_arrival[j].score + 2;
+
+                     if (nRepCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+                        (nRepCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < (pDestSlots[nArrivalsPerPosition - 1].score + nDisableScore))) {
+                        int nRepOffset = cur_arrival[j].rep_offset;
+                        int exists = 0;
+
+                        for (n = 0;
+                           n < nArrivalsPerPosition && pDestSlots[n].cost < nRepCodingChoiceCost;
+                           n++) {
+                           if (pDestSlots[n].rep_offset == nRepOffset) {
+                              exists = 1;
+                              if (!nReduce)
+                                 nRepSlotHandledMask[nMaskOffset] |= 1 << (k & 7);
+                              break;
+                           }
+                        }
+
+                        if (!exists) {
+                           for (;
+                              n < nArrivalsPerPosition && pDestSlots[n].cost == nRepCodingChoiceCost && nScore >= (pDestSlots[n].score + nDisableScore);
+                              n++) {
+                              if (pDestSlots[n].rep_offset == nRepOffset) {
+                                 exists = 1;
+                                 break;
+                              }
+                           }
+
+                           if (!exists) {
+                              if (n < nArrivalsPerPosition) {
+                                 int nn;
+
+                                 for (nn = n;
+                                    nn < nArrivalsPerPosition && pDestSlots[nn].cost == nRepCodingChoiceCost;
+                                    nn++) {
+                                    if (pDestSlots[nn].rep_offset == nRepOffset) {
+                                       exists = 1;
+                                       break;
+                                    }
+                                 }
+
+                                 if (!exists) {
+                                    int z;
+
+                                    for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                       if (pDestSlots[z].rep_offset == nRepOffset)
+                                          break;
+                                    }
+
+                                    memmove(&pDestSlots[n + 1],
+                                       &pDestSlots[n],
+                                       sizeof(lzsa_arrival) * (z - n));
+
+                                    lzsa_arrival* pDestArrival = &pDestSlots[n];
+                                    pDestArrival->cost = nRepCodingChoiceCost;
+                                    pDestArrival->from_pos = i;
+                                    pDestArrival->from_slot = j + 1;
+                                    pDestArrival->match_len = k;
+                                    pDestArrival->num_literals = 0;
+                                    pDestArrival->score = nScore;
+                                    pDestArrival->rep_offset = nRepOffset;
+                                    pDestArrival->rep_pos = i;
+                                    pDestArrival->rep_len = k;
+                                    nRepLenHandledMask[k >> 3] &= ~(1 << (k & 7));
+                                 }
+                              }
+                           }
+                        }
+                     }
+                     else {
+                        break;
+                     }
+                  }
                }
+
+               if (k < nMaxRepInsertedLen)
+                  nMinOverallRepLen = k;
             }
          }
 
@@ -547,13 +663,16 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
       }
    }
 
-   lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];
+   lzsa_arrival *end_arrival = &arrival[(i << ARRIVALS_PER_POSITION_SHIFT) + 0];
 
    while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
       if (end_arrival->from_pos >= nEndOffset) return;
       pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
-      pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
-      end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
+      if (end_arrival->match_len)
+         pBestMatch[end_arrival->from_pos].offset = end_arrival->rep_offset;
+      else
+         pBestMatch[end_arrival->from_pos].offset = 0;
+      end_arrival = &arrival[(end_arrival->from_pos << ARRIVALS_PER_POSITION_SHIFT) + (end_arrival->from_slot - 1)];
    }
 }
 
@@ -582,12 +701,12 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
       lzsa_match *pMatch = pBestMatch + i;
 
       if (pMatch->length == 0 &&
-         (i + 1) < (nEndOffset - LAST_LITERALS) &&
+         (i + 1) < nEndOffset &&
          pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V2 &&
          pBestMatch[i + 1].length < MAX_VARLEN &&
          pBestMatch[i + 1].offset &&
          i >= pBestMatch[i + 1].offset &&
-         (i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
+         (i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
          !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
          int nCurLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V2);
          int nReducedLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V2);
@@ -623,7 +742,7 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
                    * matching large regions of identical bytes for instance, where there are too many offsets to be considered by the parser, and when not compressing to favor the
                    * ratio (the forward arrivals parser already has this covered). */
                   if (i > nRepMatchOffset &&
-                     (i - nRepMatchOffset + pMatch->length) <= (nEndOffset - LAST_LITERALS) &&
+                     (i - nRepMatchOffset + pMatch->length) <= nEndOffset &&
                      !memcmp(pInWindow + i - nRepMatchOffset, pInWindow + i - pMatch->offset, pMatch->length)) {
                      pMatch->offset = nRepMatchOffset;
                      nDidReduce = 1;
@@ -632,7 +751,7 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
 
                if (pBestMatch[nNextIndex].offset && pMatch->offset != pBestMatch[nNextIndex].offset && nRepMatchOffset != pBestMatch[nNextIndex].offset) {
                   /* Otherwise, try to gain a match forward as well */
-                  if (i > pBestMatch[nNextIndex].offset && (i - pBestMatch[nNextIndex].offset + pMatch->length) <= (nEndOffset - LAST_LITERALS)) {
+                  if (i > pBestMatch[nNextIndex].offset && (i - pBestMatch[nNextIndex].offset + pMatch->length) <= nEndOffset) {
                      int nMaxLen = 0;
                      while (nMaxLen < pMatch->length && pInWindow[i - pBestMatch[nNextIndex].offset + nMaxLen] == pInWindow[i - pMatch->offset + nMaxLen])
                         nMaxLen++;
@@ -675,20 +794,20 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
                      nCurCommandSize += (pMatch->offset <= 32) ? 4 : ((pMatch->offset <= 512) ? 8 : ((pMatch->offset <= (8192 + 512)) ? 12 : 16));
 
                   /* Calculate the next command's current cost */
-                  int nNextCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNextLiterals) + (nNextLiterals << 3) + lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
+                  int nNextCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNextLiterals) + /* (nNextLiterals << 3) + */ lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
                   if (pBestMatch[nNextIndex].offset != pMatch->offset)
                      nNextCommandSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
 
                   int nOriginalCombinedCommandSize = nCurCommandSize + nNextCommandSize;
 
                   /* Calculate the cost of replacing this match command by literals + the next command with the cost of encoding these literals (excluding 'nNumLiterals' bytes) */
-                  int nReducedCommandSize = (pMatch->length << 3) + 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals + pMatch->length + nNextLiterals) + (nNextLiterals << 3) + lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
+                  int nReducedCommandSize = (pMatch->length << 3) + 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals + pMatch->length + nNextLiterals) + /* (nNextLiterals << 3) + */ lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
                   if (pBestMatch[nNextIndex].offset != nRepMatchOffset)
                      nReducedCommandSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
 
                   int nReplaceRepOffset = 0;
                   if (nRepMatchOffset && nRepMatchOffset != nPrevRepMatchOffset && nRepMatchLen >= MIN_MATCH_SIZE_V2 && nRepMatchOffset != pBestMatch[nNextIndex].offset && nRepIndex > pBestMatch[nNextIndex].offset &&
-                     (nRepIndex - pBestMatch[nNextIndex].offset + nRepMatchLen) <= (nEndOffset - LAST_LITERALS) &&
+                     (nRepIndex - pBestMatch[nNextIndex].offset + nRepMatchLen) <= nEndOffset &&
                      !memcmp(pInWindow + nRepIndex - nRepMatchOffset, pInWindow + nRepIndex - pBestMatch[nNextIndex].offset, nRepMatchLen)) {
                      /* Replacing this match command by literals would let us create a repmatch */
                      nReplaceRepOffset = 1;
@@ -729,26 +848,30 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
                pBestMatch[i + pMatch->length].length)) {
 
             int nNextIndex = i + pMatch->length;
-            int nNextLiterals = 0;
 
             while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < MIN_MATCH_SIZE_V2) {
-               nNextLiterals++;
                nNextIndex++;
             }
 
+            int nNextOffset;
+            if (nNextIndex < nEndOffset)
+               nNextOffset = pBestMatch[nNextIndex].offset;
+            else
+               nNextOffset = 0;
+
             int nCurPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length - MIN_MATCH_SIZE_V2);
 
-            nCurPartialSize += 8 /* token */ + lzsa_get_literals_varlen_size_v2(0) + lzsa_get_match_varlen_size_v2(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
+            nCurPartialSize += 8 /* token */ + /* lzsa_get_literals_varlen_size_v2(0) + */ lzsa_get_match_varlen_size_v2(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
             if (pBestMatch[i + pMatch->length].offset != pMatch->offset)
                nCurPartialSize += (pBestMatch[i + pMatch->length].offset <= 32) ? 4 : ((pBestMatch[i + pMatch->length].offset <= 512) ? 8 : ((pBestMatch[i + pMatch->length].offset <= (8192 + 512)) ? 12 : 16));
 
-            if (pBestMatch[nNextIndex].offset != pBestMatch[i + pMatch->length].offset)
-               nCurPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+            if (nNextOffset != pBestMatch[i + pMatch->length].offset)
+               nCurPartialSize += (nNextOffset <= 32) ? 4 : ((nNextOffset <= 512) ? 8 : ((nNextOffset <= (8192 + 512)) ? 12 : 16));
 
             int nReducedPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
 
-            if (pBestMatch[nNextIndex].offset != pMatch->offset)
-               nReducedPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+            if (nNextOffset != pMatch->offset)
+               nReducedPartialSize += (nNextOffset <= 32) ? 4 : ((nNextOffset <= 512) ? 8 : ((nNextOffset <= (8192 + 512)) ? 12 : 16));
 
             if (nCurPartialSize >= nReducedPartialSize) {
                int nMatchLen = pMatch->length;
@@ -793,7 +916,6 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
 static int lzsa_get_compressed_size_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
    int i;
    int nNumLiterals = 0;
-   int nOutOffset = 0;
    int nRepMatchOffset = 0;
    int nCompressedSize = 0;
 
@@ -838,7 +960,6 @@ static int lzsa_get_compressed_size_v2(lzsa_compressor *pCompressor, lzsa_match
    }
 
    {
-      int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V2) ? LITERALS_RUN_LEN_V2 : nNumLiterals;
       int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + (nNumLiterals << 3);
 
       nCompressedSize += nCommandSize;
@@ -846,7 +967,7 @@ static int lzsa_get_compressed_size_v2(lzsa_compressor *pCompressor, lzsa_match
    }
 
    if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
-      nCompressedSize += (8 + 4 + 8);
+      nCompressedSize += (8 + 4);
    }
 
    return nCompressedSize;
@@ -870,7 +991,7 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
    int nNumLiterals = 0;
    int nInFirstLiteralOffset = 0;
    int nOutOffset = 0;
-   int nCurNibbleOffset = -1, nCurFreeNibbles = 0;
+   int nCurNibbleOffset = -1;
    int nRepMatchOffset = 0;
 
    for (i = nStartOffset; i < nEndOffset; ) {
@@ -916,7 +1037,7 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
             return -1;
 
          pOutData[nOutOffset++] = nTokenOffsetMode | (nTokenLiteralsLen << 3) | nTokenMatchLen;
-         nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
+         nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, nNumLiterals);
          if (nOutOffset < 0) return -1;
 
          if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
@@ -933,14 +1054,14 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
          }
 
          if (nTokenOffsetMode == 0x00 || nTokenOffsetMode == 0x20) {
-            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, ((-nMatchOffset) & 0x1e) >> 1);
+            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, ((-nMatchOffset) & 0x1e) >> 1);
             if (nOutOffset < 0) return -1;
          }
          else if (nTokenOffsetMode == 0x40 || nTokenOffsetMode == 0x60) {
             pOutData[nOutOffset++] = (-nMatchOffset) & 0xff;
          }
          else if (nTokenOffsetMode == 0x80 || nTokenOffsetMode == 0xa0) {
-            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, ((-(nMatchOffset - 512)) >> 9) & 0x0f);
+            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, ((-(nMatchOffset - 512)) >> 9) & 0x0f);
             if (nOutOffset < 0) return -1;
             pOutData[nOutOffset++] = (-(nMatchOffset - 512)) & 0xff;
          }
@@ -954,7 +1075,7 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
 
          nRepMatchOffset = nMatchOffset;
 
-         nOutOffset = lzsa_write_match_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nEncodedMatchLen);
+         nOutOffset = lzsa_write_match_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, nEncodedMatchLen);
          if (nOutOffset < 0) return -1;
 
          if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
@@ -1013,10 +1134,10 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
          return -1;
 
       if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)
-         pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0x47;
+         pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0xe7;
       else
          pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0x00;
-      nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
+      nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, nNumLiterals);
       if (nOutOffset < 0) return -1;
 
       if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
@@ -1046,9 +1167,8 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
 
       if (nOutOffset >= nMaxOutDataSize)
          return -1;
-      pOutData[nOutOffset++] = 0;      /* Match offset */
 
-      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 15);   /* Extended match length nibble */
+      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, 15);   /* Extended match length nibble */
       if (nOutOffset < 0) return -1;
 
       if ((nOutOffset + 1) > nMaxOutDataSize)
@@ -1058,7 +1178,7 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
    }
 
    if (nCurNibbleOffset != -1) {
-      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 0);
+      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, 0);
       if (nOutOffset < 0 || nCurNibbleOffset != -1)
          return -1;
    }
@@ -1079,19 +1199,19 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
  * @return size of compressed data in output buffer, or -1 if the data is uncompressible
  */
 static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
-   int nCurNibbleOffset = -1, nCurFreeNibbles = 0;
+   int nCurNibbleOffset = -1;
    int nNumLiterals = nEndOffset - nStartOffset;
    int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V2) ? LITERALS_RUN_LEN_V2 : nNumLiterals;
    int nOutOffset = 0;
 
-   int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + (nNumLiterals << 3) + 8 + 4 + 8;
+   int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + (nNumLiterals << 3) + 4 + 8;
    if ((nOutOffset + ((nCommandSize + 7) >> 3)) > nMaxOutDataSize)
       return -1;
 
    pCompressor->num_commands = 0;
-   pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0x47;
+   pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0xe7;
 
-   nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
+   nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, nNumLiterals);
    if (nOutOffset < 0) return -1;
 
    if (nNumLiterals != 0) {
@@ -1102,9 +1222,7 @@ static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, co
 
    /* Emit EOD marker for raw block */
 
-   pOutData[nOutOffset++] = 0;      /* Match offset */
-
-   nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 15);   /* Extended match length nibble */
+   nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, 15);   /* Extended match length nibble */
    if (nOutOffset < 0) return -1;
 
    if ((nOutOffset + 1) > nMaxOutDataSize)
@@ -1115,7 +1233,7 @@ static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, co
    pCompressor->num_commands++;
 
    if (nCurNibbleOffset != -1) {
-      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 0);
+      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, 0);
       if (nOutOffset < 0 || nCurNibbleOffset != -1)
          return -1;
    }
@@ -1137,12 +1255,27 @@ static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, co
  */
 int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
    int nResult, nBaseCompressedSize;
-   int nMatchesPerArrival = (nInDataSize < 65536) ? NMATCHES_PER_ARRIVAL_V2_BIG : NMATCHES_PER_ARRIVAL_V2_SMALL;
+   int nArrivalsPerPosition = (nInDataSize < 65536) ? NARRIVALS_PER_POSITION_V2_BIG : NARRIVALS_PER_POSITION_V2_SMALL;
+   int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   int i;
+
+   i = 0;
+   while (i < (nPreviousBlockSize + nInDataSize)) {
+      int nRangeStartIdx = i;
+      unsigned char c = pInWindow[nRangeStartIdx];
+      do {
+         i++;
+      } while (i < (nPreviousBlockSize + nInDataSize) && pInWindow[i] == c);
+      while (nRangeStartIdx < i) {
+         rle_len[nRangeStartIdx] = i - nRangeStartIdx;
+         nRangeStartIdx++;
+      }
+   }
 
    /* Compress optimally without breaking ties in favor of less tokens */
    
    memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
-   lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */, nMatchesPerArrival);
+   lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */, nArrivalsPerPosition);
 
    int nDidReduce;
    int nPasses = 0;
@@ -1159,7 +1292,7 @@ int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigne
 
       /* Compress optimally and do break ties in favor of less tokens */
       memset(pCompressor->improved_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
-      lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */, nMatchesPerArrival);
+      lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */, nArrivalsPerPosition);
 
       nPasses = 0;
       do {
@@ -1169,8 +1302,77 @@ int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigne
 
       nReducedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
       if (nReducedCompressedSize > 0 && nReducedCompressedSize <= nBaseCompressedSize) {
+         const int nEndOffset = nPreviousBlockSize + nInDataSize;
+         int nSupplementedCompressedSize;
+
          /* Pick the parse with the reduced number of tokens as it didn't negatively affect the size */
          pBestMatch = pCompressor->improved_match - nPreviousBlockSize;
+
+         int* first_offset_for_byte = pCompressor->first_offset_for_byte;
+         int* next_offset_for_pos = pCompressor->next_offset_for_pos;
+         int nPosition;
+
+         /* Supplement small matches */
+
+         memset(first_offset_for_byte, 0xff, sizeof(int) * 65536);
+         memset(next_offset_for_pos, 0xff, sizeof(int) * nInDataSize);
+
+         for (nPosition = nPreviousBlockSize; nPosition < nEndOffset - 1; nPosition++) {
+            next_offset_for_pos[nPosition - nPreviousBlockSize] = first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)];
+            first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)] = nPosition;
+         }
+
+         for (nPosition = nPreviousBlockSize + 1; nPosition < (nEndOffset - 1); nPosition++) {
+            lzsa_match* match = pCompressor->match + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT_V2);
+            int m = 0, nInserted = 0;
+            int nMatchPos;
+
+            while (m < 15 && match[m].length)
+               m++;
+
+            for (nMatchPos = next_offset_for_pos[nPosition - nPreviousBlockSize]; m < 15 && nMatchPos >= 0; nMatchPos = next_offset_for_pos[nMatchPos - nPreviousBlockSize]) {
+               int nMatchOffset = nPosition - nMatchPos;
+               int nExistingMatchIdx;
+               int nAlreadyExists = 0;
+
+               for (nExistingMatchIdx = 0; nExistingMatchIdx < m; nExistingMatchIdx++) {
+                  if (match[nExistingMatchIdx].offset == nMatchOffset) {
+                     nAlreadyExists = 1;
+                     break;
+                  }
+               }
+
+               if (!nAlreadyExists) {
+                  int nMatchLen = 2;
+                  while (nMatchLen < 16 && (nPosition + nMatchLen + 4) < nEndOffset && !memcmp(pInWindow + nMatchPos + nMatchLen, pInWindow + nPosition + nMatchLen, 4))
+                     nMatchLen += 4;
+                  while (nMatchLen < 16 && (nPosition + nMatchLen) < nEndOffset && pInWindow[nMatchPos + nMatchLen] == pInWindow[nPosition + nMatchLen])
+                     nMatchLen++;
+                  match[m].length = nMatchLen;
+                  match[m].offset = nMatchOffset;
+                  m++;
+                  nInserted++;
+                  if (nInserted >= 15)
+                     break;
+               }
+            }
+         }
+
+         /* Compress optimally with the extra matches */
+         memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+         lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */, nArrivalsPerPosition);
+
+         nPasses = 0;
+         do {
+            nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+            nPasses++;
+         } while (nDidReduce && nPasses < 20);
+
+         nSupplementedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+         if (nSupplementedCompressedSize > 0 && nSupplementedCompressedSize < nReducedCompressedSize) {
+            /* Pick the parse with the extra matches, as it is strictly smaller than the reduced parse */
+            pBestMatch = pCompressor->best_match - nPreviousBlockSize;
+         }
       }
    }
 
diff --git a/src/shrink_context.c b/src/shrink_context.c
index c1e7ab3..9e6900f 100644
--- a/src/shrink_context.c
+++ b/src/shrink_context.c
@@ -62,6 +62,10 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
    pCompressor->best_match = NULL;
    pCompressor->improved_match = NULL;
    pCompressor->arrival = NULL;
+   pCompressor->rep_slot_handled_mask = NULL;
+   pCompressor->rep_len_handled_mask = NULL;
+   pCompressor->first_offset_for_byte = NULL;
+   pCompressor->next_offset_for_pos = NULL;
    pCompressor->min_match_size = nMinMatchSize;
    if (pCompressor->min_match_size < nMinMatchSizeForFormat)
       pCompressor->min_match_size = nMinMatchSizeForFormat;
@@ -89,7 +93,7 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
             pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));
 
             if (pCompressor->open_intervals) {
-               pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << MATCHES_PER_ARRIVAL_SHIFT) * sizeof(lzsa_arrival));
+               pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << ARRIVALS_PER_POSITION_SHIFT) * sizeof(lzsa_arrival));
    
                if (pCompressor->arrival) {
                   pCompressor->best_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));
@@ -102,8 +106,26 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
                            pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V2 * sizeof(lzsa_match));
                         else
                            pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V1 * sizeof(lzsa_match));
-                        if (pCompressor->match)
-                           return 0;
+                        if (pCompressor->match) {
+                           if (pCompressor->format_version == 2) {
+                              pCompressor->rep_slot_handled_mask = (char*)malloc(NARRIVALS_PER_POSITION_V2_BIG * ((LCP_MAX + 1) / 8) * sizeof(char));
+                              if (pCompressor->rep_slot_handled_mask) {
+                                 pCompressor->rep_len_handled_mask = (char*)malloc(((LCP_MAX + 1) / 8) * sizeof(char));
+                                 if (pCompressor->rep_len_handled_mask) {
+                                    pCompressor->first_offset_for_byte = (int*)malloc(65536 * sizeof(int));
+                                    if (pCompressor->first_offset_for_byte) {
+                                       pCompressor->next_offset_for_pos = (int*)malloc(BLOCK_SIZE * sizeof(int));
+                                       if (pCompressor->next_offset_for_pos) {
+                                          return 0;
+                                       }
+                                    }
+                                 }
+                              }
+                           }
+                           else {
+                              return 0;
+                           }
+                        }
                      }
                   }
                }
@@ -124,6 +146,26 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
 void lzsa_compressor_destroy(lzsa_compressor *pCompressor) {
    divsufsort_destroy(&pCompressor->divsufsort_context);
 
+   if (pCompressor->next_offset_for_pos) {
+      free(pCompressor->next_offset_for_pos);
+      pCompressor->next_offset_for_pos = NULL;
+   }
+
+   if (pCompressor->first_offset_for_byte) {
+      free(pCompressor->first_offset_for_byte);
+      pCompressor->first_offset_for_byte = NULL;
+   }
+
+   if (pCompressor->rep_len_handled_mask) {
+      free(pCompressor->rep_len_handled_mask);
+      pCompressor->rep_len_handled_mask = NULL;
+   }
+
+   if (pCompressor->rep_slot_handled_mask) {
+      free(pCompressor->rep_slot_handled_mask);
+      pCompressor->rep_slot_handled_mask = NULL;
+   }
+
    if (pCompressor->match) {
       free(pCompressor->match);
       pCompressor->match = NULL;
diff --git a/src/shrink_context.h b/src/shrink_context.h
index 70245cf..ce80fbd 100644
--- a/src/shrink_context.h
+++ b/src/shrink_context.h
@@ -49,10 +49,10 @@ extern "C" {
 #define VISITED_FLAG 0x80000000
 #define EXCL_VISITED_MASK  0x7fffffff
 
-#define NMATCHES_PER_ARRIVAL_V1 8
-#define NMATCHES_PER_ARRIVAL_V2_SMALL 9
-#define NMATCHES_PER_ARRIVAL_V2_BIG 32
-#define MATCHES_PER_ARRIVAL_SHIFT 5
+#define NARRIVALS_PER_POSITION_V1 8
+#define NARRIVALS_PER_POSITION_V2_SMALL 9
+#define NARRIVALS_PER_POSITION_V2_BIG 32
+#define ARRIVALS_PER_POSITION_SHIFT 5
 
 #define NMATCHES_PER_INDEX_V1 8
 #define MATCHES_PER_INDEX_SHIFT_V1 3
@@ -63,8 +63,6 @@ extern "C" {
 #define LEAVE_ALONE_MATCH_SIZE 300
 #define LEAVE_ALONE_MATCH_SIZE_SMALL 1000
 
-#define LAST_LITERALS 0
-
 #define MODESWITCH_PENALTY 3
 
 /** One match */
@@ -81,12 +79,10 @@ typedef struct {
 
    int from_pos;
    unsigned short rep_len;
+   unsigned short match_len;
    int rep_pos;
    int num_literals;
    int score;
-
-   unsigned short match_offset;
-   unsigned short match_len;
 } lzsa_arrival;
 
 /** Compression statistics */
@@ -128,6 +124,10 @@ typedef struct _lzsa_compressor {
    lzsa_match *best_match;
    lzsa_match *improved_match;
    lzsa_arrival *arrival;
+   char *rep_slot_handled_mask;
+   char *rep_len_handled_mask;
+   int *first_offset_for_byte;
+   int *next_offset_for_pos;
    int min_match_size;
    int format_version;
    int flags;